From 701fe571ecb77cd5b1d3d99607b46d18f75a4659 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Wed, 31 Jan 2024 22:18:25 +0000 Subject: [PATCH 01/32] PSMDB-1284 implement $_backupFile aggregation stage --- src/mongo/db/pipeline/SConscript | 1 + .../pipeline/document_source_backup_file.cpp | 187 ++++++++++++++++++ .../db/pipeline/document_source_backup_file.h | 139 +++++++++++++ 3 files changed, 327 insertions(+) create mode 100644 src/mongo/db/pipeline/document_source_backup_file.cpp create mode 100644 src/mongo/db/pipeline/document_source_backup_file.h diff --git a/src/mongo/db/pipeline/SConscript b/src/mongo/db/pipeline/SConscript index a523cfb93b986..4a05b280c9ad7 100644 --- a/src/mongo/db/pipeline/SConscript +++ b/src/mongo/db/pipeline/SConscript @@ -244,6 +244,7 @@ env.Library( source=[ 'document_source_backup_cursor.cpp', 'document_source_backup_cursor_extend.cpp', + 'document_source_backup_file.cpp', ], LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/db/server_base', diff --git a/src/mongo/db/pipeline/document_source_backup_file.cpp b/src/mongo/db/pipeline/document_source_backup_file.cpp new file mode 100644 index 0000000000000..a18dd12782d90 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_backup_file.cpp @@ -0,0 +1,187 @@ +/*====== +This file is part of Percona Server for MongoDB. + +Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. + + This program is free software: you can redistribute it and/or modify + it under the terms of the Server Side Public License, version 1, + as published by MongoDB, Inc. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + Server Side Public License for more details. + + You should have received a copy of the Server Side Public License + along with this program. If not, see + . + + As a special exception, the copyright holders give permission to link the + code of portions of this program with the OpenSSL library under certain + conditions as described in each individual source file and distribute + linked combinations including the program with the OpenSSL library. You + must comply with the Server Side Public License in all respects for + all of the code used other than as permitted herein. If you modify file(s) + with this exception, you may extend this exception to your version of the + file(s), but you are not obligated to do so. If you do not wish to do so, + delete this exception statement from your version. If you delete this + exception statement from all source files in the program, then also delete + it in the license file. 
+======= */
+
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery
+
+#include "mongo/db/pipeline/document_source_backup_file.h"
+
+#include <array>
+#include <fstream>
+#include <ios>
+#include <string>
+
+#include "mongo/base/data_range.h"
+#include "mongo/base/error_codes.h"
+#include "mongo/base/string_data.h"
+#include "mongo/bson/bsonmisc.h"
+#include "mongo/bson/bsonobj.h"
+#include "mongo/bson/bsontypes.h"
+#include "mongo/db/exec/document_value/document.h"
+#include "mongo/db/namespace_string.h"
+#include "mongo/util/assert_util.h"
+#include "mongo/util/intrusive_counter.h"
+#include "mongo/util/str.h"
+#include "mongo/util/uuid.h"
+
+namespace mongo {
+
+namespace {
+constexpr StringData kBackupId = "backupId"_sd;
+constexpr StringData kFile = "file"_sd;
+constexpr StringData kByteOffset = "byteOffset"_sd;
+
+// We only link this file into mongod so this stage doesn't exist in mongos
+REGISTER_INTERNAL_DOCUMENT_SOURCE(_backupFile,
+                                  DocumentSourceBackupFile::LiteParsed::parse,
+                                  DocumentSourceBackupFile::createFromBson,
+                                  true);
+}  // namespace
+
+using boost::intrusive_ptr;
+
+std::unique_ptr<DocumentSourceBackupFile::LiteParsed> DocumentSourceBackupFile::LiteParsed::parse(
+    const NamespaceString& nss, const BSONElement& spec) {
+
+    return std::make_unique<LiteParsed>(spec.fieldName());
+}
+
+const char* DocumentSourceBackupFile::getSourceName() const {
+    return kStageName.rawData();
+}
+
+Value DocumentSourceBackupFile::serialize(const SerializationOptions& opts) const {
+    return Value{Document{{getSourceName(),
+                           Document{{kBackupId, Value(_backupId)},
+                                    {kFile, Value(_filePath)},
+                                    {kByteOffset, Value(_byteOffset)}}}}};
+}
+
+DocumentSource::GetNextResult DocumentSourceBackupFile::doGetNext() {
+    if (_file.eof()) {
+        return GetNextResult::makeEOF();
+    }
+
+    auto byteOffset = _file.tellg();
+    _file.read(_dataBuf.data(), kBlockSize);
+    uassert(ErrorCodes::FileStreamFailed,
+            str::stream() << "Error reading file " << _filePath << " at offset " << byteOffset,
+            !_file.bad());
+    auto bytesRead = _file.gcount();
+    auto eof = _file.eof();
+
+    Document doc;
+    doc = Document{{"byteOffset"_sd, static_cast<long long>(byteOffset)},
+                   {"data"_sd, BSONBinData(_dataBuf.data(), bytesRead, BinDataGeneral)},
+                   {"endOfFile"_sd, eof}};
+
+    return doc;
+}
+
+intrusive_ptr<DocumentSource> DocumentSourceBackupFile::createFromBson(
+    BSONElement spec, const intrusive_ptr<ExpressionContext>& pExpCtx) {
+    // This cursor is non-tailable so we don't touch pExpCtx->tailableMode here
+
+    uassert(ErrorCodes::FailedToParse,
+            str::stream() << kStageName
+                          << " parameters must be specified in an object, but found: "
+                          << typeName(spec.type()),
+            spec.type() == Object);
+
+    auto backupId = UUID::fromCDR(std::array<unsigned char, 16>{});
+    std::string filePath;
+    long long byteOffset = 0;
+
+    for (auto&& elem : spec.embeddedObject()) {
+        const auto fieldName = elem.fieldNameStringData();
+
+        if (fieldName == kBackupId) {
+            uassert(ErrorCodes::TypeMismatch,
+                    str::stream() << "The '" << fieldName << "' parameter of the " << kStageName
+                                  << " stage must be a binary data value, but found: "
+                                  << typeName(elem.type()),
+                    elem.type() == BSONType::BinData);
+            backupId = uassertStatusOK(UUID::parse(elem));
+        } else if (fieldName == kFile) {
+            uassert(ErrorCodes::TypeMismatch,
+                    str::stream() << "The '" << fieldName << "' parameter of the " << kStageName
+                                  << " stage must be a string value, but found: "
+                                  << typeName(elem.type()),
+                    elem.type() == BSONType::String);
+            filePath = elem.String();
+        } else if (fieldName == kByteOffset) {
+            uassert(ErrorCodes::TypeMismatch,
+                    str::stream() << "The '" << fieldName << "' parameter of the " <<
kStageName
+                                  << " stage must be a long integer value, but found: "
+                                  << typeName(elem.type()),
+                    elem.type() == BSONType::NumberLong);
+            byteOffset = elem.Long();
+        } else {
+            uasserted(ErrorCodes::FailedToParse,
+                      str::stream() << "Unrecognized option '" << fieldName << "' in "
+                                    << kStageName << " stage");
+        }
+    }
+
+    uassert(ErrorCodes::InvalidOptions,
+            str::stream() << "'" << kByteOffset << "' parameter cannot be less than zero",
+            byteOffset >= 0);
+
+    std::ifstream iFile(filePath, std::ios_base::in | std::ios_base::binary);
+    uassert(ErrorCodes::FileOpenFailed,
+            str::stream() << "Failed to open file " << filePath,
+            iFile.is_open());
+    iFile.seekg(byteOffset);
+    uassert(ErrorCodes::FileOpenFailed,
+            str::stream() << "Failed to set read position " << byteOffset << " in file "
+                          << filePath,
+            !iFile.fail());
+    invariant(byteOffset == iFile.tellg());
+
+    return make_intrusive<DocumentSourceBackupFile>(
+        pExpCtx, backupId, std::move(filePath), byteOffset, std::move(iFile));
+}
+
+DocumentSourceBackupFile::DocumentSourceBackupFile(
+    const intrusive_ptr<ExpressionContext>& expCtx,
+    UUID backupId,
+    std::string filePath,
+    long long byteOffset,
+    std::ifstream file)
+    : DocumentSource(kStageName, expCtx),
+      _dataBuf(),
+      _backupId(backupId),
+      _filePath(std::move(filePath)),
+      _byteOffset(byteOffset),
+      _file(std::move(file)) {}
+
+DocumentSourceBackupFile::~DocumentSourceBackupFile() {
+    _file.close();
+}
+
+}  // namespace mongo
diff --git a/src/mongo/db/pipeline/document_source_backup_file.h b/src/mongo/db/pipeline/document_source_backup_file.h
new file mode 100644
index 0000000000000..2fbd0e7b0a820
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_backup_file.h
@@ -0,0 +1,139 @@
+/*======
+This file is part of Percona Server for MongoDB.
+
+Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved.
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the Server Side Public License, version 1,
+    as published by MongoDB, Inc.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    Server Side Public License for more details.
+
+    You should have received a copy of the Server Side Public License
+    along with this program. If not, see
+    <http://www.mongodb.com/licensing/server-side-public-license>.
+
+    As a special exception, the copyright holders give permission to link the
+    code of portions of this program with the OpenSSL library under certain
+    conditions as described in each individual source file and distribute
+    linked combinations including the program with the OpenSSL library. You
+    must comply with the Server Side Public License in all respects for
+    all of the code used other than as permitted herein. If you modify file(s)
+    with this exception, you may extend this exception to your version of the
+    file(s), but you are not obligated to do so. If you do not wish to do so,
+    delete this exception statement from your version. If you delete this
+    exception statement from all source files in the program, then also delete
+    it in the license file.
+======= */
+
+#pragma once
+
+#include <array>
+#include <fstream>
+#include <ios>
+#include <memory>
+#include <string>
+
+#include <boost/smart_ptr/intrusive_ptr.hpp>
+
+#include "mongo/base/string_data.h"
+#include "mongo/bson/bsonelement.h"
+#include "mongo/db/namespace_string.h"
+#include "mongo/db/pipeline/document_source.h"
+#include "mongo/db/pipeline/lite_parsed_document_source.h"
+#include "mongo/stdx/unordered_set.h"
+#include "mongo/util/uuid.h"
+
+namespace mongo {
+
+class DocumentSourceBackupFile final : public DocumentSource {
+public:
+    static constexpr StringData kStageName = "$_backupFile"_sd;
+
+    class LiteParsed final : public LiteParsedDocumentSource {
+    public:
+        using LiteParsedDocumentSource::LiteParsedDocumentSource;
+
+        static std::unique_ptr<LiteParsed> parse(const NamespaceString& nss,
+                                                 const BSONElement& spec);
+
+        stdx::unordered_set<NamespaceString> getInvolvedNamespaces() const final {
+            return {};
+        }
+
+        PrivilegeVector requiredPrivileges(
+            [[maybe_unused]] bool isMongos,
+            [[maybe_unused]] bool bypassDocumentValidation) const final {
+            return {Privilege(ResourcePattern::forClusterResource(), ActionType::fsync)};
+        }
+
+        bool isInitialSource() const final {
+            return true;
+        }
+
+        void assertSupportsMultiDocumentTransaction() const final {
+            transactionNotSupported(kStageName);
+        }
+    };
+
+    /**
+     * Parses a $_backupFile stage from 'spec'.
+     */
+    static boost::intrusive_ptr<DocumentSource> createFromBson(
+        BSONElement spec, const boost::intrusive_ptr<ExpressionContext>& pCtx);
+
+    DocumentSourceBackupFile(const boost::intrusive_ptr<ExpressionContext>& expCtx,
+                             UUID backupId,
+                             std::string filePath,
+                             long long byteOffset,
+                             std::ifstream file);
+
+    DocumentSourceBackupFile(const DocumentSourceBackupFile&) = delete;
+    DocumentSourceBackupFile& operator=(const DocumentSourceBackupFile&) = delete;
+    DocumentSourceBackupFile(DocumentSourceBackupFile&&) = delete;
+    DocumentSourceBackupFile& operator=(DocumentSourceBackupFile&&) = delete;
+
+    ~DocumentSourceBackupFile() override;
+
+    const char* getSourceName() const override;
+
+    StageConstraints constraints([[maybe_unused]] Pipeline::SplitState pipeState) const override {
+        StageConstraints constraints{StreamType::kStreaming,
+                                     PositionRequirement::kFirst,
+                                     HostTypeRequirement::kNone,
+                                     DiskUseRequirement::kNoDiskUse,
+                                     FacetRequirement::kNotAllowed,
+                                     TransactionRequirement::kNotAllowed,
+                                     LookupRequirement::kAllowed,
+                                     UnionRequirement::kNotAllowed,
+                                     ChangeStreamRequirement::kDenylist};
+        constraints.isIndependentOfAnyCollection = true;
+        constraints.requiresInputDocSource = false;
+        return constraints;
+    }
+
+    Value serialize(const SerializationOptions& opts = SerializationOptions()) const final;
+
+    boost::optional<DistributedPlanLogic> distributedPlanLogic() final {
+        return boost::none;
+    }
+
+    void addVariableRefs(std::set<Variables::Id>* refs) const final {}
+
+protected:
+    GetNextResult doGetNext() override;
+
+private:
+    static constexpr std::streamsize kBlockSize = 1 << 20;
+
+    std::array<char, kBlockSize> _dataBuf;
+    const UUID _backupId;
+    const std::string _filePath;
+    const long long _byteOffset;
+    std::ifstream _file;
+};
+
+}  // namespace mongo
From a0ccbc0cde04f6191d001f125df787d7f912a7da Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Fri, 2 Feb 2024 20:08:54 +0000
Subject: [PATCH 02/32] PSMDB-1284 specify read concern capability of
 $backupCursor & $backupCursorExtend

When a command is issued by an internal client, the read concern must be
explicitly specified.
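For example (illustrative command shape only, not part of this change), an
internal client opening a backup cursor is now expected to state the read
concern explicitly:

    { aggregate: 1, pipeline: [ { $backupCursor: {} } ],
      readConcern: { level: "local" }, cursor: {} }

Any level other than "local" is rejected via onlyReadConcernLocalSupported().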
--- src/mongo/db/pipeline/document_source_backup_cursor.h | 7 ++++++- .../db/pipeline/document_source_backup_cursor_extend.h | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/mongo/db/pipeline/document_source_backup_cursor.h b/src/mongo/db/pipeline/document_source_backup_cursor.h index 0d112cc67121b..2fa35c3fa56d0 100644 --- a/src/mongo/db/pipeline/document_source_backup_cursor.h +++ b/src/mongo/db/pipeline/document_source_backup_cursor.h @@ -59,7 +59,12 @@ class DocumentSourceBackupCursor : public DocumentSource { return true; } - void assertSupportsMultiDocumentTransaction() const { + ReadConcernSupportResult supportsReadConcern(repl::ReadConcernLevel level, + bool isImplicitDefault) const final { + return onlyReadConcernLocalSupported(kStageName, level, isImplicitDefault); + } + + void assertSupportsMultiDocumentTransaction() const final { transactionNotSupported(kStageName); } }; diff --git a/src/mongo/db/pipeline/document_source_backup_cursor_extend.h b/src/mongo/db/pipeline/document_source_backup_cursor_extend.h index b92040d9d9fd3..1905a1bd5d940 100644 --- a/src/mongo/db/pipeline/document_source_backup_cursor_extend.h +++ b/src/mongo/db/pipeline/document_source_backup_cursor_extend.h @@ -59,7 +59,12 @@ class DocumentSourceBackupCursorExtend : public DocumentSource { return true; } - void assertSupportsMultiDocumentTransaction() const { + ReadConcernSupportResult supportsReadConcern(repl::ReadConcernLevel level, + bool isImplicitDefault) const final { + return onlyReadConcernLocalSupported(kStageName, level, isImplicitDefault); + } + + void assertSupportsMultiDocumentTransaction() const final { transactionNotSupported(kStageName); } }; From 554ad1e1ca91e388394aa15968c39d6d23bbc126 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Fri, 12 Apr 2024 21:15:27 +0100 Subject: [PATCH 03/32] PSMDB-1284 Clone InitialSyncer to InitialSyncerFCB with fileCopyBased name --- src/mongo/db/repl/SConscript | 1 + src/mongo/db/repl/initial_syncer_fcb.cpp | 2269 ++++++++++++++++++++++ src/mongo/db/repl/initial_syncer_fcb.h | 726 +++++++ 3 files changed, 2996 insertions(+) create mode 100644 src/mongo/db/repl/initial_syncer_fcb.cpp create mode 100644 src/mongo/db/repl/initial_syncer_fcb.h diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index dcb7643d0a192..dd0e56c9cc2d8 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1254,6 +1254,7 @@ env.Library( 'initial_syncer.cpp', 'initial_syncer_common_stats.cpp', 'initial_syncer_factory.cpp', + 'initial_syncer_fcb.cpp', ], LIBDEPS=[ '$BUILD_DIR/mongo/client/clientdriver_network', diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp new file mode 100644 index 0000000000000..c4c30b2527927 --- /dev/null +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -0,0 +1,2269 @@ +/*====== +This file is part of Percona Server for MongoDB. + +Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. + + This program is free software: you can redistribute it and/or modify + it under the terms of the Server Side Public License, version 1, + as published by MongoDB, Inc. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + Server Side Public License for more details. + + You should have received a copy of the Server Side Public License + along with this program. If not, see + . 
+ + As a special exception, the copyright holders give permission to link the + code of portions of this program with the OpenSSL library under certain + conditions as described in each individual source file and distribute + linked combinations including the program with the OpenSSL library. You + must comply with the Server Side Public License in all respects for + all of the code used other than as permitted herein. If you modify file(s) + with this exception, you may extend this exception to your version of the + file(s), but you are not obligated to do so. If you do not wish to do so, + delete this exception statement from your version. If you delete this + exception statement from all source files in the program, then also delete + it in the license file. +======= */ + +#include "initial_syncer_fcb.h" + +#include +#include +#include +#include +#include + +#include "mongo/base/error_codes.h" +#include "mongo/base/status.h" +#include "mongo/bson/bsonmisc.h" +#include "mongo/bson/bsonobjbuilder.h" +#include "mongo/bson/timestamp.h" +#include "mongo/client/fetcher.h" +#include "mongo/client/remote_command_retry_scheduler.h" +#include "mongo/db/client.h" +#include "mongo/db/feature_compatibility_version_parser.h" +#include "mongo/db/index_builds_coordinator.h" +#include "mongo/db/namespace_string.h" +#include "mongo/db/repl/all_database_cloner.h" +#include "mongo/db/repl/initial_sync_state.h" +#include "mongo/db/repl/initial_syncer_common_stats.h" +#include "mongo/db/repl/initial_syncer_factory.h" +#include "mongo/db/repl/initial_syncer_interface.h" +#include "mongo/db/repl/oplog_buffer.h" +#include "mongo/db/repl/oplog_fetcher.h" +#include "mongo/db/repl/optime.h" +#include "mongo/db/repl/repl_server_parameters_gen.h" +#include "mongo/db/repl/replication_consistency_markers.h" +#include "mongo/db/repl/replication_process.h" +#include "mongo/db/repl/storage_interface.h" +#include "mongo/db/repl/sync_source_selector.h" +#include "mongo/db/repl/tenant_migration_access_blocker_util.h" +#include "mongo/db/repl/transaction_oplog_application.h" +#include "mongo/db/serverless/serverless_operation_lock_registry.h" +#include "mongo/db/session/session_txn_record_gen.h" +#include "mongo/executor/task_executor.h" +#include "mongo/logv2/log.h" +#include "mongo/platform/compiler.h" // IWYU pragma: keep +#include "mongo/stdx/mutex.h" +#include "mongo/util/assert_util.h" +#include "mongo/util/destructor_guard.h" +#include "mongo/util/fail_point.h" +#include "mongo/util/scopeguard.h" +#include "mongo/util/str.h" +#include "mongo/util/time_support.h" +#include "mongo/util/timer.h" +#include "mongo/util/version/releases.h" + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kReplicationInitialSync + + +namespace mongo { +namespace repl { + +// Failpoint for initial sync +MONGO_FAIL_POINT_DEFINE(failInitialSyncWithBadHostFCB); + +// Failpoint which fails initial sync and leaves an oplog entry in the buffer. +MONGO_FAIL_POINT_DEFINE(failInitSyncWithBufferedEntriesLeftFCB); + +// Failpoint which causes the initial sync function to hang after getting the oldest active +// transaction timestamp from the sync source. +MONGO_FAIL_POINT_DEFINE(initialSyncHangAfterGettingBeginFetchingTimestampFCB); + +// Failpoint which causes the initial sync function to hang before creating shared data and +// splitting control flow between the oplog fetcher and the cloners. 
+MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeSplittingControlFlowFCB);
+
+// Failpoint which causes the initial sync function to hang before copying databases.
+MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeCopyingDatabasesFCB);
+
+// Failpoint which causes the initial sync function to hang before finishing.
+MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeFinishFCB);
+
+// Failpoint which causes the initial sync function to hang before creating the oplog.
+MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeCreatingOplogFCB);
+
+// Failpoint which stops the applier.
+MONGO_FAIL_POINT_DEFINE(rsSyncApplyStopFCB);
+
+// Failpoint which causes the initial sync function to hang after cloning all databases.
+MONGO_FAIL_POINT_DEFINE(initialSyncHangAfterDataCloningFCB);
+
+// Failpoint which skips clearing _initialSyncState after a successful initial sync attempt.
+MONGO_FAIL_POINT_DEFINE(skipClearInitialSyncStateFCB);
+
+// Failpoint which causes the initial sync function to fail and hang before starting a new attempt.
+MONGO_FAIL_POINT_DEFINE(failAndHangInitialSyncFCB);
+
+// Failpoint which fails initial sync before it applies the next batch of oplog entries.
+MONGO_FAIL_POINT_DEFINE(failInitialSyncBeforeApplyingBatchFCB);
+
+// Failpoint which fasserts if applying a batch fails.
+MONGO_FAIL_POINT_DEFINE(initialSyncFassertIfApplyingBatchFailsFCB);
+
+// Failpoint which causes the initial sync function to hang before stopping the oplog fetcher.
+MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeCompletingOplogFetchingFCB);
+
+// Failpoint which causes the initial sync function to hang before choosing a sync source.
+MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeChoosingSyncSourceFCB);
+
+// Failpoint which causes the initial sync function to hang after finishing.
+MONGO_FAIL_POINT_DEFINE(initialSyncHangAfterFinishFCB);
+
+// Failpoints for synchronization, shared with cloners.
+extern FailPoint initialSyncFuzzerSynchronizationPoint1;
+extern FailPoint initialSyncFuzzerSynchronizationPoint2;
+
+namespace {
+using namespace executor;
+using CallbackArgs = executor::TaskExecutor::CallbackArgs;
+using Event = executor::TaskExecutor::EventHandle;
+using Handle = executor::TaskExecutor::CallbackHandle;
+using QueryResponseStatus = StatusWith<Fetcher::QueryResponse>;
+using UniqueLock = stdx::unique_lock<Latch>;
+using LockGuard = stdx::lock_guard<Latch>;
+
+// Used to reset the oldest timestamp during initial sync to a non-null timestamp.
+const Timestamp kTimestampOne(0, 1);
+
+ServiceContext::UniqueOperationContext makeOpCtx() {
+    return cc().makeOperationContext();
+}
+
+StatusWith<OpTimeAndWallTime> parseOpTimeAndWallTime(const QueryResponseStatus& fetchResult) {
+    if (!fetchResult.isOK()) {
+        return fetchResult.getStatus();
+    }
+    const auto docs = fetchResult.getValue().documents;
+    const auto hasDoc = docs.begin() != docs.end();
+    if (!hasDoc) {
+        return StatusWith<OpTimeAndWallTime>{ErrorCodes::NoMatchingDocument,
+                                             "no oplog entry found"};
+    }
+
+    return OpTimeAndWallTime::parseOpTimeAndWallTimeFromOplogEntry(docs.front());
+}
+
+void pauseAtInitialSyncFuzzerSyncronizationPoints(std::string msg) {
+    // Set and unset by the InitialSyncTest fixture to cause initial sync to pause so that the
+    // Initial Sync Fuzzer can run commands on the sync source.
+ if (MONGO_unlikely(initialSyncFuzzerSynchronizationPoint1.shouldFail())) { + LOGV2(21158, + "initialSyncFuzzerSynchronizationPoint1 fail point enabled", + "failpointMessage"_attr = msg); + initialSyncFuzzerSynchronizationPoint1.pauseWhileSet(); + } + + if (MONGO_unlikely(initialSyncFuzzerSynchronizationPoint2.shouldFail())) { + LOGV2(21160, "initialSyncFuzzerSynchronizationPoint2 fail point enabled"); + initialSyncFuzzerSynchronizationPoint2.pauseWhileSet(); + } +} + +} // namespace + +const ServiceContext::ConstructorActionRegisterer initialSyncerRegistererFCB( + "InitialSyncerRegistererFCB", + {"InitialSyncerFactoryRegisterer"} /* dependency list */, + [](ServiceContext* service) { + InitialSyncerFactory::get(service)->registerInitialSyncer( + "fileCopyBased", + [](InitialSyncerInterface::Options opts, + std::unique_ptr dataReplicatorExternalState, + ThreadPool* writerPool, + StorageInterface* storage, + ReplicationProcess* replicationProcess, + const InitialSyncerInterface::OnCompletionFn& onCompletion) { + return std::make_shared(opts, + std::move(dataReplicatorExternalState), + writerPool, + storage, + replicationProcess, + onCompletion); + }); + }); + +InitialSyncerFCB::InitialSyncerFCB( + InitialSyncerInterface::Options opts, + std::unique_ptr dataReplicatorExternalState, + ThreadPool* writerPool, + StorageInterface* storage, + ReplicationProcess* replicationProcess, + const OnCompletionFn& onCompletion) + : _fetchCount(0), + _opts(opts), + _dataReplicatorExternalState(std::move(dataReplicatorExternalState)), + _exec(_dataReplicatorExternalState->getSharedTaskExecutor()), + _clonerExec(_exec), + _writerPool(writerPool), + _storage(storage), + _replicationProcess(replicationProcess), + _onCompletion(onCompletion), + _createClientFn( + [] { return std::make_unique(true /* autoReconnect */); }), + _createOplogFetcherFn(CreateOplogFetcherFn::get()) { + uassert(ErrorCodes::BadValue, "task executor cannot be null", _exec); + uassert(ErrorCodes::BadValue, "invalid storage interface", _storage); + uassert(ErrorCodes::BadValue, "invalid replication process", _replicationProcess); + uassert(ErrorCodes::BadValue, "invalid getMyLastOptime function", _opts.getMyLastOptime); + uassert(ErrorCodes::BadValue, "invalid setMyLastOptime function", _opts.setMyLastOptime); + uassert(ErrorCodes::BadValue, "invalid resetOptimes function", _opts.resetOptimes); + uassert(ErrorCodes::BadValue, "invalid sync source selector", _opts.syncSourceSelector); + uassert(ErrorCodes::BadValue, "callback function cannot be null", _onCompletion); +} + +InitialSyncerFCB::~InitialSyncerFCB() { + DESTRUCTOR_GUARD({ + shutdown().transitional_ignore(); + join(); + }); +} + +bool InitialSyncerFCB::isActive() const { + stdx::lock_guard lock(_mutex); + return _isActive_inlock(); +} + +bool InitialSyncerFCB::_isActive_inlock() const { + return State::kRunning == _state || State::kShuttingDown == _state; +} + +std::string InitialSyncerFCB::getInitialSyncMethod() const { + return "logical"; +} + +Status InitialSyncerFCB::startup(OperationContext* opCtx, + std::uint32_t initialSyncMaxAttempts) noexcept { + invariant(opCtx); + invariant(initialSyncMaxAttempts >= 1U); + + stdx::lock_guard lock(_mutex); + switch (_state) { + case State::kPreStart: + _state = State::kRunning; + break; + case State::kRunning: + return {ErrorCodes::IllegalOperation, "initial syncer already started"}; + case State::kShuttingDown: + return {ErrorCodes::ShutdownInProgress, "initial syncer shutting down"}; + case State::kComplete: + return 
{ErrorCodes::ShutdownInProgress, "initial syncer completed"};
+    }
+
+    _setUp_inlock(opCtx, initialSyncMaxAttempts);
+
+    // Start first initial sync attempt.
+    std::uint32_t initialSyncAttempt = 0;
+    _attemptExec = std::make_unique<executor::ScopedTaskExecutor>(
+        _exec, Status(ErrorCodes::CallbackCanceled, "Initial Sync Attempt Canceled"));
+    _clonerAttemptExec = std::make_unique<executor::ScopedTaskExecutor>(
+        _clonerExec, Status(ErrorCodes::CallbackCanceled, "Initial Sync Attempt Canceled"));
+    auto status = _scheduleWorkAndSaveHandle_inlock(
+        [=](const executor::TaskExecutor::CallbackArgs& args) {
+            _startInitialSyncAttemptCallback(args, initialSyncAttempt, initialSyncMaxAttempts);
+        },
+        &_startInitialSyncAttemptHandle,
+        str::stream() << "_startInitialSyncAttemptCallback-" << initialSyncAttempt);
+
+    if (!status.isOK()) {
+        _state = State::kComplete;
+        return status;
+    }
+
+    return Status::OK();
+}
+
+Status InitialSyncerFCB::shutdown() {
+    stdx::lock_guard<Latch> lock(_mutex);
+    switch (_state) {
+        case State::kPreStart:
+            // Transition directly from PreStart to Complete if not started yet.
+            _state = State::kComplete;
+            return Status::OK();
+        case State::kRunning:
+            _state = State::kShuttingDown;
+            break;
+        case State::kShuttingDown:
+        case State::kComplete:
+            // Nothing to do if we are already in ShuttingDown or Complete state.
+            return Status::OK();
+    }
+
+    _cancelRemainingWork_inlock();
+
+    return Status::OK();
+}
+
+void InitialSyncerFCB::cancelCurrentAttempt() {
+    stdx::lock_guard<Latch> lk(_mutex);
+    if (_isActive_inlock()) {
+        LOGV2_DEBUG(4427201,
+                    1,
+                    "Cancelling the current initial sync attempt.",
+                    "currentAttempt"_attr = _stats.failedInitialSyncAttempts + 1);
+        _cancelRemainingWork_inlock();
+    } else {
+        LOGV2_DEBUG(4427202,
+                    1,
+                    "There is no initial sync attempt to cancel because the initial syncer is not "
+                    "currently active.");
+    }
+}
+
+void InitialSyncerFCB::_cancelRemainingWork_inlock() {
+    _cancelHandle_inlock(_startInitialSyncAttemptHandle);
+    _cancelHandle_inlock(_chooseSyncSourceHandle);
+    _cancelHandle_inlock(_getBaseRollbackIdHandle);
+    _cancelHandle_inlock(_getLastRollbackIdHandle);
+    _cancelHandle_inlock(_getNextApplierBatchHandle);
+
+    _shutdownComponent_inlock(_oplogFetcher);
+    if (_sharedData) {
+        // We actually hold the required lock, but the lock object itself is not passed through.
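+        // WithLock::withoutLock() performs no checking; it only documents at the call site
+        // that the caller is responsible for holding '_mutex' here.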
+ _clearRetriableError(WithLock::withoutLock()); + stdx::lock_guard lock(*_sharedData); + _sharedData->setStatusIfOK( + lock, Status{ErrorCodes::CallbackCanceled, "Initial sync attempt canceled"}); + } + if (_client) { + _client->shutdownAndDisallowReconnect(); + } + _shutdownComponent_inlock(_applier); + _shutdownComponent_inlock(_fCVFetcher); + _shutdownComponent_inlock(_lastOplogEntryFetcher); + _shutdownComponent_inlock(_beginFetchingOpTimeFetcher); + (*_attemptExec)->shutdown(); + (*_clonerAttemptExec)->shutdown(); + _attemptCanceled = true; +} + +void InitialSyncerFCB::join() { + stdx::unique_lock lk(_mutex); + _stateCondition.wait(lk, [this]() { return !_isActive_inlock(); }); +} + +InitialSyncerFCB::State InitialSyncerFCB::getState_forTest() const { + stdx::lock_guard lk(_mutex); + return _state; +} + +Date_t InitialSyncerFCB::getWallClockTime_forTest() const { + stdx::lock_guard lk(_mutex); + return _lastApplied.wallTime; +} + +void InitialSyncerFCB::setAllowedOutageDuration_forTest(Milliseconds allowedOutageDuration) { + stdx::lock_guard lk(_mutex); + _allowedOutageDuration = allowedOutageDuration; + if (_sharedData) { + stdx::lock_guard lk(*_sharedData); + _sharedData->setAllowedOutageDuration_forTest(lk, allowedOutageDuration); + } +} + +bool InitialSyncerFCB::_isShuttingDown() const { + stdx::lock_guard lock(_mutex); + return _isShuttingDown_inlock(); +} + +bool InitialSyncerFCB::_isShuttingDown_inlock() const { + return State::kShuttingDown == _state; +} + +std::string InitialSyncerFCB::getDiagnosticString() const { + LockGuard lk(_mutex); + str::stream out; + out << "InitialSyncerFCB -" << " oplogFetcher: " << _oplogFetcher->toString() + << " opsBuffered: " << _oplogBuffer->getSize() << " active: " << _isActive_inlock() + << " shutting down: " << _isShuttingDown_inlock(); + if (_initialSyncState) { + out << " opsAppied: " << _initialSyncState->appliedOps; + } + + return out; +} + +BSONObj InitialSyncerFCB::getInitialSyncProgress() const { + LockGuard lk(_mutex); + + // We return an empty BSON object after an initial sync attempt has been successfully + // completed. When an initial sync attempt completes successfully, initialSyncCompletes is + // incremented and then _initialSyncState is cleared. We check that _initialSyncState has been + // cleared because an initial sync attempt can fail even after initialSyncCompletes is + // incremented, and we also check that initialSyncCompletes is positive because an initial sync + // attempt can also fail before _initialSyncState is initialized. 
+ if (!_initialSyncState && initial_sync_common_stats::initialSyncCompletes.get() > 0) { + return {}; + } + return _getInitialSyncProgress_inlock(); +} + +void InitialSyncerFCB::_appendInitialSyncProgressMinimal_inlock(BSONObjBuilder* bob) const { + bob->append("method", "logical"); + _stats.append(bob); + if (!_initialSyncState) { + return; + } + if (_initialSyncState->allDatabaseCloner) { + const auto allDbClonerStats = _initialSyncState->allDatabaseCloner->getStats(); + const auto approxTotalDataSize = allDbClonerStats.dataSize; + bob->appendNumber("approxTotalDataSize", approxTotalDataSize); + long long approxTotalBytesCopied = 0; + for (auto const& dbClonerStats : allDbClonerStats.databaseStats) { + for (auto const& collClonerStats : dbClonerStats.collectionStats) { + approxTotalBytesCopied += collClonerStats.approxBytesCopied; + } + } + bob->appendNumber("approxTotalBytesCopied", approxTotalBytesCopied); + if (approxTotalBytesCopied > 0) { + const auto statsObj = bob->asTempObj(); + auto totalInitialSyncElapsedMillis = + statsObj.getField("totalInitialSyncElapsedMillis").safeNumberLong(); + const auto downloadRate = + (double)totalInitialSyncElapsedMillis / (double)approxTotalBytesCopied; + const auto remainingInitialSyncEstimatedMillis = + downloadRate * (double)(approxTotalDataSize - approxTotalBytesCopied); + bob->appendNumber("remainingInitialSyncEstimatedMillis", + (long long)remainingInitialSyncEstimatedMillis); + } + } + bob->appendNumber("appliedOps", static_cast(_initialSyncState->appliedOps)); + if (!_initialSyncState->beginApplyingTimestamp.isNull()) { + bob->append("initialSyncOplogStart", _initialSyncState->beginApplyingTimestamp); + } + // Only include the beginFetchingTimestamp if it's different from the beginApplyingTimestamp. 
+ if (!_initialSyncState->beginFetchingTimestamp.isNull() && + _initialSyncState->beginFetchingTimestamp != _initialSyncState->beginApplyingTimestamp) { + bob->append("initialSyncOplogFetchingStart", _initialSyncState->beginFetchingTimestamp); + } + if (!_initialSyncState->stopTimestamp.isNull()) { + bob->append("initialSyncOplogEnd", _initialSyncState->stopTimestamp); + } + if (_sharedData) { + stdx::lock_guard sdLock(*_sharedData); + auto unreachableSince = _sharedData->getSyncSourceUnreachableSince(sdLock); + if (unreachableSince != Date_t()) { + bob->append("syncSourceUnreachableSince", unreachableSince); + bob->append("currentOutageDurationMillis", + durationCount(_sharedData->getCurrentOutageDuration(sdLock))); + } + bob->append("totalTimeUnreachableMillis", + durationCount(_sharedData->getTotalTimeUnreachable(sdLock))); + } +} + +BSONObj InitialSyncerFCB::_getInitialSyncProgress_inlock() const { + try { + BSONObjBuilder bob; + _appendInitialSyncProgressMinimal_inlock(&bob); + if (_initialSyncState) { + if (_initialSyncState->allDatabaseCloner) { + BSONObjBuilder dbsBuilder(bob.subobjStart("databases")); + _initialSyncState->allDatabaseCloner->getStats().append(&dbsBuilder); + dbsBuilder.doneFast(); + } + } + return bob.obj(); + } catch (const DBException& e) { + LOGV2(21161, + "Error creating initial sync progress object: {error}", + "Error creating initial sync progress object", + "error"_attr = e.toString()); + } + BSONObjBuilder bob; + _appendInitialSyncProgressMinimal_inlock(&bob); + return bob.obj(); +} + +void InitialSyncerFCB::setCreateClientFn_forTest(const CreateClientFn& createClientFn) { + LockGuard lk(_mutex); + _createClientFn = createClientFn; +} + +void InitialSyncerFCB::setCreateOplogFetcherFn_forTest( + std::unique_ptr createOplogFetcherFn) { + LockGuard lk(_mutex); + _createOplogFetcherFn = std::move(createOplogFetcherFn); +} + +OplogFetcher* InitialSyncerFCB::getOplogFetcher_forTest() const { + // Wait up to 10 seconds. + for (auto i = 0; i < 100; i++) { + { + LockGuard lk(_mutex); + if (_oplogFetcher) { + return _oplogFetcher.get(); + } + } + sleepmillis(100); + } + invariant(false, "Timed out getting OplogFetcher pointer for test"); + return nullptr; +} + +void InitialSyncerFCB::setClonerExecutor_forTest( + std::shared_ptr clonerExec) { + _clonerExec = std::move(clonerExec); +} + +void InitialSyncerFCB::waitForCloner_forTest() { + _initialSyncState->allDatabaseClonerFuture.wait(); +} + +void InitialSyncerFCB::_setUp_inlock(OperationContext* opCtx, + std::uint32_t initialSyncMaxAttempts) { + // 'opCtx' is passed through from startup(). 
+ _replicationProcess->getConsistencyMarkers()->setInitialSyncFlag(opCtx); + _replicationProcess->getConsistencyMarkers()->clearInitialSyncId(opCtx); + + auto* serviceCtx = opCtx->getServiceContext(); + _storage->setInitialDataTimestamp(serviceCtx, Timestamp::kAllowUnstableCheckpointsSentinel); + _storage->setStableTimestamp(serviceCtx, Timestamp::min()); + + LOGV2_DEBUG(21162, 1, "Creating oplogBuffer"); + _oplogBuffer = _dataReplicatorExternalState->makeInitialSyncOplogBuffer(opCtx); + _oplogBuffer->startup(opCtx); + + _stats.initialSyncStart = _exec->now(); + _stats.maxFailedInitialSyncAttempts = initialSyncMaxAttempts; + _stats.failedInitialSyncAttempts = 0; + _stats.exec = std::weak_ptr(_exec); + + _allowedOutageDuration = Seconds(initialSyncTransientErrorRetryPeriodSeconds.load()); +} + +void InitialSyncerFCB::_tearDown_inlock(OperationContext* opCtx, + const StatusWith& lastApplied) { + _stats.initialSyncEnd = _exec->now(); + + // This might not be necessary if we failed initial sync. + invariant(_oplogBuffer); + _oplogBuffer->shutdown(opCtx); + + if (!lastApplied.isOK()) { + return; + } + const auto lastAppliedOpTime = lastApplied.getValue().opTime; + auto initialDataTimestamp = lastAppliedOpTime.getTimestamp(); + + // A node coming out of initial sync must guarantee at least one oplog document is visible + // such that others can sync from this node. Oplog visibility is only advanced when applying + // oplog entries during initial sync. Correct the visibility to match the initial sync time + // before transitioning to steady state replication. + const bool orderedCommit = true; + _storage->oplogDiskLocRegister(opCtx, initialDataTimestamp, orderedCommit); + + tenant_migration_access_blocker::recoverTenantMigrationAccessBlockers(opCtx); + ServerlessOperationLockRegistry::recoverLocks(opCtx); + reconstructPreparedTransactions(opCtx, repl::OplogApplication::Mode::kInitialSync); + + _replicationProcess->getConsistencyMarkers()->setInitialSyncIdIfNotSet(opCtx); + + // We set the initial data timestamp before clearing the initial sync flag. See comments in + // clearInitialSyncFlag. 
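+    // (If the order were reversed and the node crashed between the two writes, it could
+    // restart believing initial sync completed without a correct initialDataTimestamp.)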
+ _storage->setInitialDataTimestamp(opCtx->getServiceContext(), initialDataTimestamp); + + _replicationProcess->getConsistencyMarkers()->clearInitialSyncFlag(opCtx); + + auto currentLastAppliedOpTime = _opts.getMyLastOptime(); + if (currentLastAppliedOpTime.isNull()) { + _opts.setMyLastOptime(lastApplied.getValue()); + } else { + invariant(currentLastAppliedOpTime == lastAppliedOpTime); + } + + LOGV2(21163, + "initial sync done; took " + "{duration}.", + "Initial sync done", + "duration"_attr = + duration_cast(_stats.initialSyncEnd - _stats.initialSyncStart)); + initial_sync_common_stats::initialSyncCompletes.increment(); +} + +void InitialSyncerFCB::_startInitialSyncAttemptCallback( + const executor::TaskExecutor::CallbackArgs& callbackArgs, + std::uint32_t initialSyncAttempt, + std::uint32_t initialSyncMaxAttempts) noexcept { + auto status = [&] { + stdx::lock_guard lock(_mutex); + return _checkForShutdownAndConvertStatus_inlock( + callbackArgs, + str::stream() << "error while starting initial sync attempt " + << (initialSyncAttempt + 1) << " of " << initialSyncMaxAttempts); + }(); + + if (!status.isOK()) { + _finishInitialSyncAttempt(status); + return; + } + + LOGV2(21164, + "Starting initial sync (attempt {initialSyncAttempt} of {initialSyncMaxAttempts})", + "Starting initial sync attempt", + "initialSyncAttempt"_attr = (initialSyncAttempt + 1), + "initialSyncMaxAttempts"_attr = initialSyncMaxAttempts); + + // This completion guard invokes _finishInitialSyncAttempt on destruction. + auto cancelRemainingWorkInLock = [this]() { + _cancelRemainingWork_inlock(); + }; + auto finishInitialSyncAttemptFn = [this](const StatusWith& lastApplied) { + _finishInitialSyncAttempt(lastApplied); + }; + auto onCompletionGuard = + std::make_shared(cancelRemainingWorkInLock, finishInitialSyncAttemptFn); + + // Lock guard must be declared after completion guard because completion guard destructor + // has to run outside lock. + stdx::lock_guard lock(_mutex); + + _oplogApplier = {}; + + LOGV2_DEBUG( + 21165, 2, "Resetting sync source so a new one can be chosen for this initial sync attempt"); + _syncSource = HostAndPort(); + + LOGV2_DEBUG(21166, 2, "Resetting all optimes before starting this initial sync attempt"); + _opts.resetOptimes(); + _lastApplied = {OpTime(), Date_t()}; + _lastFetched = {}; + + LOGV2_DEBUG( + 21167, 2, "Resetting the oldest timestamp before starting this initial sync attempt"); + auto* storageEngine = getGlobalServiceContext()->getStorageEngine(); + if (storageEngine) { + // Set the oldestTimestamp to one because WiredTiger does not allow us to set it to zero + // since that would also set the all_durable point to zero. We specifically don't set + // the stable timestamp here because that will trigger taking a first stable checkpoint even + // though the initialDataTimestamp is still set to kAllowUnstableCheckpointsSentinel. + storageEngine->setOldestTimestamp(kTimestampOne, true /*force*/); + } + + LOGV2_DEBUG(21168, + 2, + "Resetting feature compatibility version to last-lts. If the sync source is in " + "latest feature compatibility version, we will find out when we clone the " + "server configuration collection (admin.system.version)"); + serverGlobalParams.mutableFCV.reset(); + + // Clear the oplog buffer. + _oplogBuffer->clear(makeOpCtx().get()); + + // Get sync source. + std::uint32_t chooseSyncSourceAttempt = 0; + std::uint32_t chooseSyncSourceMaxAttempts = + static_cast(numInitialSyncConnectAttempts.load()); + + // _scheduleWorkAndSaveHandle_inlock() is shutdown-aware. 
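+    // If shutdown is already in progress, it returns a non-OK status instead of scheduling
+    // the callback; that status is surfaced through the completion guard below.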
+ status = _scheduleWorkAndSaveHandle_inlock( + [=](const executor::TaskExecutor::CallbackArgs& args) { + _chooseSyncSourceCallback( + args, chooseSyncSourceAttempt, chooseSyncSourceMaxAttempts, onCompletionGuard); + }, + &_chooseSyncSourceHandle, + str::stream() << "_chooseSyncSourceCallback-" << chooseSyncSourceAttempt); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} + +void InitialSyncerFCB::_chooseSyncSourceCallback( + const executor::TaskExecutor::CallbackArgs& callbackArgs, + std::uint32_t chooseSyncSourceAttempt, + std::uint32_t chooseSyncSourceMaxAttempts, + std::shared_ptr onCompletionGuard) noexcept try { + if (MONGO_unlikely(initialSyncHangBeforeChoosingSyncSourceFCB.shouldFail())) { + LOGV2(5284800, "initialSyncHangBeforeChoosingSyncSourceFCB fail point enabled"); + initialSyncHangBeforeChoosingSyncSourceFCB.pauseWhileSet(); + } + + stdx::unique_lock lock(_mutex); + // Cancellation should be treated the same as other errors. In this case, the most likely cause + // of a failed _chooseSyncSourceCallback() task is a cancellation triggered by + // InitialSyncerFCB::shutdown() or the task executor shutting down. + auto status = + _checkForShutdownAndConvertStatus_inlock(callbackArgs, "error while choosing sync source"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + if (MONGO_unlikely(failInitialSyncWithBadHostFCB.shouldFail())) { + status = Status(ErrorCodes::InvalidSyncSource, + "initial sync failed - failInitialSyncWithBadHostFCB failpoint is set."); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + auto syncSource = _chooseSyncSource_inlock(); + if (!syncSource.isOK()) { + if (chooseSyncSourceAttempt + 1 >= chooseSyncSourceMaxAttempts) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + Status(ErrorCodes::InitialSyncOplogSourceMissing, + "No valid sync source found in current replica set to do an initial sync.")); + return; + } + + auto when = (*_attemptExec)->now() + _opts.syncSourceRetryWait; + LOGV2_DEBUG(21169, + 1, + "Error getting sync source: '{error}', trying again in " + "{syncSourceRetryWait} at {retryTime}. Attempt {chooseSyncSourceAttempt} of " + "{numInitialSyncConnectAttempts}", + "Error getting sync source. Waiting to retry", + "error"_attr = syncSource.getStatus(), + "syncSourceRetryWait"_attr = _opts.syncSourceRetryWait, + "retryTime"_attr = when.toString(), + "chooseSyncSourceAttempt"_attr = (chooseSyncSourceAttempt + 1), + "numInitialSyncConnectAttempts"_attr = numInitialSyncConnectAttempts.load()); + auto status = _scheduleWorkAtAndSaveHandle_inlock( + when, + [=](const executor::TaskExecutor::CallbackArgs& args) { + _chooseSyncSourceCallback(args, + chooseSyncSourceAttempt + 1, + chooseSyncSourceMaxAttempts, + onCompletionGuard); + }, + &_chooseSyncSourceHandle, + str::stream() << "_chooseSyncSourceCallback-" << (chooseSyncSourceAttempt + 1)); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + return; + } + + if (MONGO_unlikely(initialSyncHangBeforeCreatingOplogFCB.shouldFail())) { + // This log output is used in js tests so please leave it. + LOGV2(21170, + "initial sync - initialSyncHangBeforeCreatingOplogFCB fail point " + "enabled. 
Blocking until fail point is disabled."); + lock.unlock(); + while (MONGO_unlikely(initialSyncHangBeforeCreatingOplogFCB.shouldFail()) && + !_isShuttingDown()) { + mongo::sleepsecs(1); + } + lock.lock(); + } + + // There is no need to schedule separate task to create oplog collection since we are already in + // a callback and we are certain there's no existing operation context (required for creating + // collections and dropping user databases) attached to the current thread. + status = _truncateOplogAndDropReplicatedDatabases(); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + _syncSource = syncSource.getValue(); + + // Schedule rollback ID checker. + _rollbackChecker = std::make_unique(*_attemptExec, _syncSource); + auto scheduleResult = _rollbackChecker->reset([=](const RollbackChecker::Result& result) { + return _rollbackCheckerResetCallback(result, onCompletionGuard); + }); + status = scheduleResult.getStatus(); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + _getBaseRollbackIdHandle = scheduleResult.getValue(); +} catch (const DBException&) { + // Report exception as an initial syncer failure. + stdx::unique_lock lock(_mutex); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus()); +} + +Status InitialSyncerFCB::_truncateOplogAndDropReplicatedDatabases() { + // truncate oplog; drop user databases. + LOGV2_DEBUG(4540700, + 1, + "About to truncate the oplog, if it exists, ns:{namespace}, and drop all " + "user databases (so that we can clone them).", + "About to truncate the oplog, if it exists, and drop all user databases (so that " + "we can clone them)", + logAttrs(NamespaceString::kRsOplogNamespace)); + + auto opCtx = makeOpCtx(); + // This code can make untimestamped writes (deletes) to the _mdb_catalog on top of existing + // timestamped updates. + opCtx->recoveryUnit()->allowAllUntimestampedWrites(); + + // We are not replicating nor validating these writes. + UnreplicatedWritesBlock unreplicatedWritesBlock(opCtx.get()); + + // 1.) Truncate the oplog. + LOGV2_DEBUG(4540701, + 2, + "Truncating the existing oplog: {namespace}", + "Truncating the existing oplog", + logAttrs(NamespaceString::kRsOplogNamespace)); + Timer timer; + auto status = _storage->truncateCollection(opCtx.get(), NamespaceString::kRsOplogNamespace); + LOGV2(21173, + "Initial syncer oplog truncation finished in: {durationMillis}ms", + "Initial syncer oplog truncation finished", + "durationMillis"_attr = timer.millis()); + if (!status.isOK()) { + // 1a.) Create the oplog. + LOGV2_DEBUG(4540702, + 2, + "Creating the oplog: {namespace}", + "Creating the oplog", + logAttrs(NamespaceString::kRsOplogNamespace)); + status = _storage->createOplog(opCtx.get(), NamespaceString::kRsOplogNamespace); + if (!status.isOK()) { + return status; + } + } + + // 2a.) Abort any index builds started during initial sync. + IndexBuildsCoordinator::get(opCtx.get()) + ->abortAllIndexBuildsForInitialSync(opCtx.get(), "Aborting index builds for initial sync"); + + // 2b.) Drop user databases. 
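+    // dropReplicatedDatabases() leaves the "local" database in place; all other databases
+    // are recreated from the sync source during cloning.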
+ LOGV2_DEBUG(21175, 2, "Dropping user databases"); + return _storage->dropReplicatedDatabases(opCtx.get()); +} + +void InitialSyncerFCB::_rollbackCheckerResetCallback( + const RollbackChecker::Result& result, std::shared_ptr onCompletionGuard) { + stdx::lock_guard lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock(result.getStatus(), + "error while getting base rollback ID"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + // Since the beginFetchingOpTime is retrieved before significant work is done copying + // data from the sync source, we allow the OplogEntryFetcher to use its default retry strategy + // which retries up to 'numInitialSyncOplogFindAttempts' times'. This will fail relatively + // quickly in the presence of network errors, allowing us to choose a different sync source. + status = _scheduleLastOplogEntryFetcher_inlock( + [=](const StatusWith& response, + mongo::Fetcher::NextAction*, + mongo::BSONObjBuilder*) mutable { + _lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime(response, + onCompletionGuard); + }, + kFetcherHandlesRetries); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} + +void InitialSyncerFCB::_lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime( + const StatusWith& result, + std::shared_ptr onCompletionGuard) { + + stdx::unique_lock lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock( + result.getStatus(), "error while getting last oplog entry for begin timestamp"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + const auto opTimeResult = parseOpTimeAndWallTime(result); + status = opTimeResult.getStatus(); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + // This is the top of the oplog before we query for the oldest active transaction timestamp. If + // that query returns that there are no active transactions, we will use this as the + // beginFetchingTimestamp. + const auto& defaultBeginFetchingOpTime = opTimeResult.getValue().opTime; + + std::string logMsg = str::stream() << "Initial Syncer got the defaultBeginFetchingTimestamp: " + << defaultBeginFetchingOpTime.toString(); + pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg); + LOGV2_DEBUG(6608900, + 1, + "Initial Syncer got the defaultBeginFetchingOpTime", + "defaultBeginFetchingOpTime"_attr = defaultBeginFetchingOpTime); + + status = _scheduleGetBeginFetchingOpTime_inlock(onCompletionGuard, defaultBeginFetchingOpTime); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} + +Status InitialSyncerFCB::_scheduleGetBeginFetchingOpTime_inlock( + std::shared_ptr onCompletionGuard, + const OpTime& defaultBeginFetchingOpTime) { + + const auto preparedState = DurableTxnState_serializer(DurableTxnStateEnum::kPrepared); + const auto inProgressState = DurableTxnState_serializer(DurableTxnStateEnum::kInProgress); + + // Obtain the oldest active transaction timestamp from the remote by querying their transactions + // table. To prevent oplog holes (primary) or a stale lastAppliedSnapshot (secondary) from + // causing this query to return an inaccurate timestamp, we specify an afterClusterTime of the + // defaultBeginFetchingOpTime so that we wait for all previous writes to be visible. 
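+    // The find command assembled below has roughly this shape:
+    //   { find: "transactions",
+    //     filter: { state: { $in: ["prepared", "inProgress"] } },
+    //     sort: { startOpTime: 1 },
+    //     readConcern: { level: "local", afterClusterTime: <defaultBeginFetchingOpTime ts> },
+    //     limit: 1 }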
+ BSONObjBuilder cmd; + cmd.append("find", NamespaceString::kSessionTransactionsTableNamespace.coll().toString()); + cmd.append("filter", + BSON("state" << BSON("$in" << BSON_ARRAY(preparedState << inProgressState)))); + cmd.append("sort", BSON(SessionTxnRecord::kStartOpTimeFieldName << 1)); + cmd.append("readConcern", + BSON("level" + << "local" + << "afterClusterTime" << defaultBeginFetchingOpTime.getTimestamp())); + cmd.append("limit", 1); + + _beginFetchingOpTimeFetcher = std::make_unique( + *_attemptExec, + _syncSource, + NamespaceString::kSessionTransactionsTableNamespace.db().toString(), + cmd.obj(), + [=](const StatusWith& response, + mongo::Fetcher::NextAction*, + mongo::BSONObjBuilder*) mutable { + _getBeginFetchingOpTimeCallback( + response, onCompletionGuard, defaultBeginFetchingOpTime); + }, + ReadPreferenceSetting::secondaryPreferredMetadata(), + RemoteCommandRequest::kNoTimeout /* find network timeout */, + RemoteCommandRequest::kNoTimeout /* getMore network timeout */, + RemoteCommandRetryScheduler::makeRetryPolicy( + numInitialSyncOplogFindAttempts.load(), executor::RemoteCommandRequest::kNoTimeout)); + Status scheduleStatus = _beginFetchingOpTimeFetcher->schedule(); + if (!scheduleStatus.isOK()) { + _beginFetchingOpTimeFetcher.reset(); + } + return scheduleStatus; +} + +void InitialSyncerFCB::_getBeginFetchingOpTimeCallback( + const StatusWith& result, + std::shared_ptr onCompletionGuard, + const OpTime& defaultBeginFetchingOpTime) { + stdx::unique_lock lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock( + result.getStatus(), + "error while getting oldest active transaction timestamp for begin fetching timestamp"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + const auto docs = result.getValue().documents; + if (docs.size() > 1) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + Status(ErrorCodes::TooManyMatchingDocuments, + str::stream() << "Expected to receive one document for the oldest active " + "transaction entry, but received: " + << docs.size() << ". First: " << redact(docs.front()) + << ". Last: " << redact(docs.back()))); + return; + } + + // Set beginFetchingOpTime if the oldest active transaction timestamp actually exists. Otherwise + // use the sync source's top of the oplog from before querying for the oldest active transaction + // timestamp. This will mean that even if a transaction is started on the sync source after + // querying for the oldest active transaction timestamp, the node will still fetch its oplog + // entries. 
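+    // Effectively: beginFetchingOpTime is the startOpTime of the oldest active transaction
+    // when one exists, and the sync source's top of the oplog (sampled earlier) otherwise.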
+ OpTime beginFetchingOpTime = defaultBeginFetchingOpTime; + if (!docs.empty()) { + auto entry = SessionTxnRecord::parse( + IDLParserContext("oldest active transaction optime for initial sync"), docs.front()); + auto optime = entry.getStartOpTime(); + if (optime) { + beginFetchingOpTime = optime.value(); + } + } + + std::string logMsg = str::stream() + << "Initial Syncer got the beginFetchingTimestamp: " << beginFetchingOpTime.toString(); + pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg); + + if (MONGO_unlikely(initialSyncHangAfterGettingBeginFetchingTimestampFCB.shouldFail())) { + LOGV2(21176, "initialSyncHangAfterGettingBeginFetchingTimestampFCB fail point enabled"); + initialSyncHangAfterGettingBeginFetchingTimestampFCB.pauseWhileSet(); + } + + // Since the beginFetchingOpTime is retrieved before significant work is done copying + // data from the sync source, we allow the OplogEntryFetcher to use its default retry strategy + // which retries up to 'numInitialSyncOplogFindAttempts' times'. This will fail relatively + // quickly in the presence of network errors, allowing us to choose a different sync source. + status = _scheduleLastOplogEntryFetcher_inlock( + [=](const StatusWith& response, + mongo::Fetcher::NextAction*, + mongo::BSONObjBuilder*) mutable { + _lastOplogEntryFetcherCallbackForBeginApplyingTimestamp( + response, onCompletionGuard, beginFetchingOpTime); + }, + kFetcherHandlesRetries); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} + +void InitialSyncerFCB::_lastOplogEntryFetcherCallbackForBeginApplyingTimestamp( + const StatusWith& result, + std::shared_ptr onCompletionGuard, + OpTime& beginFetchingOpTime) { + stdx::unique_lock lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock( + result.getStatus(), "error while getting last oplog entry for begin timestamp"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + const auto opTimeResult = parseOpTimeAndWallTime(result); + status = opTimeResult.getStatus(); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + const auto& lastOpTime = opTimeResult.getValue().opTime; + + std::string logMsg = str::stream() + << "Initial Syncer got the beginApplyingTimestamp: " << lastOpTime.toString(); + pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg); + + BSONObjBuilder queryBob; + queryBob.append("find", NamespaceString::kServerConfigurationNamespace.coll()); + auto filterBob = BSONObjBuilder(queryBob.subobjStart("filter")); + filterBob.append("_id", multiversion::kParameterName); + filterBob.done(); + // As part of reading the FCV, we ensure the source node's all_durable timestamp has advanced + // to at least the timestamp of the last optime that we found in the lastOplogEntryFetcher. + // When document locking is used, there could be oplog "holes" which would result in + // inconsistent initial sync data if we didn't do this. 
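+    // The resulting query has roughly this shape:
+    //   { find: "system.version",
+    //     filter: { _id: "featureCompatibilityVersion" },
+    //     readConcern: { afterClusterTime: <lastOpTime ts> } }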
+ auto readConcernBob = BSONObjBuilder(queryBob.subobjStart("readConcern")); + readConcernBob.append("afterClusterTime", lastOpTime.getTimestamp()); + readConcernBob.done(); + + _fCVFetcher = std::make_unique( + *_attemptExec, + _syncSource, + NamespaceString::kServerConfigurationNamespace.db().toString(), + queryBob.obj(), + [=](const StatusWith& response, + mongo::Fetcher::NextAction*, + mongo::BSONObjBuilder*) mutable { + _fcvFetcherCallback(response, onCompletionGuard, lastOpTime, beginFetchingOpTime); + }, + ReadPreferenceSetting::secondaryPreferredMetadata(), + RemoteCommandRequest::kNoTimeout /* find network timeout */, + RemoteCommandRequest::kNoTimeout /* getMore network timeout */, + RemoteCommandRetryScheduler::makeRetryPolicy( + numInitialSyncOplogFindAttempts.load(), executor::RemoteCommandRequest::kNoTimeout)); + Status scheduleStatus = _fCVFetcher->schedule(); + if (!scheduleStatus.isOK()) { + _fCVFetcher.reset(); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, scheduleStatus); + return; + } +} + +void InitialSyncerFCB::_fcvFetcherCallback(const StatusWith& result, + std::shared_ptr onCompletionGuard, + const OpTime& lastOpTime, + OpTime& beginFetchingOpTime) { + stdx::unique_lock lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock( + result.getStatus(), "error while getting the remote feature compatibility version"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + const auto docs = result.getValue().documents; + if (docs.size() > 1) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + Status(ErrorCodes::TooManyMatchingDocuments, + str::stream() << "Expected to receive one feature compatibility version " + "document, but received: " + << docs.size() << ". First: " << redact(docs.front()) + << ". Last: " << redact(docs.back()))); + return; + } + const auto hasDoc = docs.begin() != docs.end(); + if (!hasDoc) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + Status(ErrorCodes::IncompatibleServerVersion, + "Sync source had no feature compatibility version document")); + return; + } + + auto fCVParseSW = FeatureCompatibilityVersionParser::parse(docs.front()); + if (!fCVParseSW.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, fCVParseSW.getStatus()); + return; + } + + auto version = fCVParseSW.getValue(); + + // Changing the featureCompatibilityVersion during initial sync is unsafe. + // (Generic FCV reference): This FCV check should exist across LTS binary versions. + if (serverGlobalParams.featureCompatibility.acquireFCVSnapshot().isUpgradingOrDowngrading( + version)) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + Status(ErrorCodes::IncompatibleServerVersion, + str::stream() << "Sync source had unsafe feature compatibility version: " + << multiversion::toString(version))); + return; + } else { + // Since we don't guarantee that we always clone the "admin.system.version" collection first + // and collection/index creation can depend on FCV, we set the in-memory FCV value to match + // the version on the sync source. We won't persist the FCV on disk nor will we update our + // minWireVersion until we clone the actual document. 
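+        // For illustration, the document eventually cloned from admin.system.version
+        // has the shape {_id: "featureCompatibilityVersion", version: "7.0"}; the
+        // version value here is only an example, not an assumption about this build.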
+ serverGlobalParams.mutableFCV.setVersion(version); + } + + if (MONGO_unlikely(initialSyncHangBeforeSplittingControlFlowFCB.shouldFail())) { + lock.unlock(); + LOGV2(5032000, + "initial sync - initialSyncHangBeforeSplittingControlFlowFCB fail point " + "enabled. Blocking until fail point is disabled."); + while (MONGO_unlikely(initialSyncHangBeforeSplittingControlFlowFCB.shouldFail()) && + !_isShuttingDown()) { + mongo::sleepsecs(1); + } + lock.lock(); + } + + // This is where the flow of control starts to split into two parallel tracks: + // - oplog fetcher + // - data cloning and applier + _sharedData = + std::make_unique(_rollbackChecker->getBaseRBID(), + _allowedOutageDuration, + getGlobalServiceContext()->getFastClockSource()); + _client = _createClientFn(); + _initialSyncState = std::make_unique(std::make_unique( + _sharedData.get(), _syncSource, _client.get(), _storage, _writerPool)); + + // Create oplog applier. + auto* consistencyMarkers = _replicationProcess->getConsistencyMarkers(); + OplogApplier::Options options(OplogApplication::Mode::kInitialSync); + options.beginApplyingOpTime = lastOpTime; + _oplogApplier = _dataReplicatorExternalState->makeOplogApplier(_oplogBuffer.get(), + &noopOplogApplierObserver, + consistencyMarkers, + _storage, + options, + _writerPool); + + _initialSyncState->beginApplyingTimestamp = lastOpTime.getTimestamp(); + _initialSyncState->beginFetchingTimestamp = beginFetchingOpTime.getTimestamp(); + + invariant(_initialSyncState->beginApplyingTimestamp >= + _initialSyncState->beginFetchingTimestamp, + str::stream() << "beginApplyingTimestamp was less than beginFetchingTimestamp. " + "beginApplyingTimestamp: " + << _initialSyncState->beginApplyingTimestamp.toBSON() + << " beginFetchingTimestamp: " + << _initialSyncState->beginFetchingTimestamp.toBSON()); + + invariant(!result.getValue().documents.empty()); + LOGV2_DEBUG(4431600, + 2, + "Setting begin applying timestamp to {beginApplyingTimestamp}, ns: " + "{namespace} and the begin fetching timestamp to {beginFetchingTimestamp}", + "Setting begin applying timestamp and begin fetching timestamp", + "beginApplyingTimestamp"_attr = _initialSyncState->beginApplyingTimestamp, + logAttrs(NamespaceString::kRsOplogNamespace), + "beginFetchingTimestamp"_attr = _initialSyncState->beginFetchingTimestamp); + + const auto configResult = _dataReplicatorExternalState->getCurrentConfig(); + status = configResult.getStatus(); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + _initialSyncState.reset(); + return; + } + + const auto& config = configResult.getValue(); + OplogFetcher::Config oplogFetcherConfig( + beginFetchingOpTime, + _syncSource, + config, + _rollbackChecker->getBaseRBID(), + initialSyncOplogFetcherBatchSize, + OplogFetcher::RequireFresherSyncSource::kDontRequireFresherSyncSource); + oplogFetcherConfig.startingPoint = OplogFetcher::StartingPoint::kEnqueueFirstDoc; + _oplogFetcher = (*_createOplogFetcherFn)( + *_attemptExec, + std::make_unique( + _sharedData.get(), _opts.oplogFetcherMaxFetcherRestarts), + _dataReplicatorExternalState.get(), + [=](OplogFetcher::Documents::const_iterator first, + OplogFetcher::Documents::const_iterator last, + const OplogFetcher::DocumentsInfo& info) { + return _enqueueDocuments(first, last, info); + }, + [=](const Status& s, int rbid) { _oplogFetcherCallback(s, onCompletionGuard); }, + std::move(oplogFetcherConfig)); + + LOGV2_DEBUG(21178, + 2, + "Starting OplogFetcher: {oplogFetcher}", + "Starting OplogFetcher", + 
"oplogFetcher"_attr = _oplogFetcher->toString()); + + // _startupComponent_inlock is shutdown-aware. + status = _startupComponent_inlock(_oplogFetcher); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + _initialSyncState->allDatabaseCloner.reset(); + return; + } + + if (MONGO_unlikely(initialSyncHangBeforeCopyingDatabasesFCB.shouldFail())) { + lock.unlock(); + // This could have been done with a scheduleWorkAt but this is used only by JS tests where + // we run with multiple threads so it's fine to spin on this thread. + // This log output is used in js tests so please leave it. + LOGV2(21179, + "initial sync - initialSyncHangBeforeCopyingDatabasesFCB fail point " + "enabled. Blocking until fail point is disabled."); + while (MONGO_unlikely(initialSyncHangBeforeCopyingDatabasesFCB.shouldFail()) && + !_isShuttingDown()) { + mongo::sleepsecs(1); + } + lock.lock(); + } + + LOGV2_DEBUG(21180, + 2, + "Starting AllDatabaseCloner: {allDatabaseCloner}", + "Starting AllDatabaseCloner", + "allDatabaseCloner"_attr = _initialSyncState->allDatabaseCloner->toString()); + + auto [startClonerFuture, startCloner] = + _initialSyncState->allDatabaseCloner->runOnExecutorEvent(*_clonerAttemptExec); + // runOnExecutorEvent ensures the future is not ready unless an error has occurred. + if (startClonerFuture.isReady()) { + status = startClonerFuture.getNoThrow(); + invariant(!status.isOK()); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + _initialSyncState->allDatabaseClonerFuture = + std::move(startClonerFuture).onCompletion([this, onCompletionGuard](Status status) mutable { + // The completion guard must run on the main executor, and never inline. In unit tests, + // without the executor call, it would run on the wrong executor. In both production + // and in unit tests, if the cloner finishes very quickly, the callback could run + // in-line and result in self-deadlock. + stdx::unique_lock lock(_mutex); + auto exec_status = (*_attemptExec) + ->scheduleWork([this, status, onCompletionGuard]( + executor::TaskExecutor::CallbackArgs args) { + _allDatabaseClonerCallback(status, onCompletionGuard); + }); + if (!exec_status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, + exec_status.getStatus()); + // In the shutdown case, it is possible the completion guard will be run + // from this thread (since the lambda holding another copy didn't schedule). + // If it does, we will self-deadlock if we're holding the lock, so release it. + lock.unlock(); + } + // In unit tests, this reset ensures the completion guard does not run during the + // destruction of the lambda (which occurs on the wrong executor), except in the + // shutdown case. + onCompletionGuard.reset(); + }); + lock.unlock(); + // Start (and therefore finish) the cloners outside the lock. This ensures onCompletion + // is not run with the mutex held, which would result in self-deadlock. + (*_clonerAttemptExec)->signalEvent(startCloner); +} + +void InitialSyncerFCB::_oplogFetcherCallback(const Status& oplogFetcherFinishStatus, + std::shared_ptr onCompletionGuard) { + stdx::lock_guard lock(_mutex); + LOGV2(21181, + "Finished fetching oplog during initial sync: {oplogFetcherFinishStatus}. 
Last fetched "
+          "optime: {lastFetched}",
+          "Finished fetching oplog during initial sync",
+          "oplogFetcherFinishStatus"_attr = redact(oplogFetcherFinishStatus),
+          "lastFetched"_attr = _lastFetched.toString());
+
+    auto status = _checkForShutdownAndConvertStatus_inlock(
+        oplogFetcherFinishStatus, "error fetching oplog during initial sync");
+
+    // When the OplogFetcher completes early (instead of being canceled at shutdown), we log and let
+    // our reference to 'onCompletionGuard' go out of scope. Since we know the
+    // DatabasesCloner/MultiApplier will still have a reference to it, the actual function within
+    // the guard won't be fired yet.
+    // It is up to the DatabasesCloner and MultiApplier to determine if they can proceed without any
+    // additional data going into the oplog buffer.
+    // It is not common for the OplogFetcher to return with an OK status. The only time it returns
+    // an OK status is when the 'stopReplProducer' fail point is enabled, which causes the
+    // OplogFetcher to ignore the current sync source response and return early.
+    if (status.isOK()) {
+        LOGV2(21182,
+              "Finished oplog fetching early. Last fetched optime: {lastFetched}",
+              "Finished oplog fetching early",
+              "lastFetched"_attr = _lastFetched.toString());
+        return;
+    }
+
+    // During normal operation, this call to onCompletion->setResultAndCancelRemainingWork_inlock
+    // is a no-op because the other thread running the DatabasesCloner or MultiApplier will already
+    // have called it with the success/failed status.
+    // The OplogFetcher does not finish on its own because of the oplog tailing query it runs on the
+    // sync source. The most common OplogFetcher completion status is CallbackCanceled due to either
+    // a shutdown request or completion of the data cloning and oplog application phases.
+    onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+}
+
+void InitialSyncerFCB::_allDatabaseClonerCallback(
+    const Status& databaseClonerFinishStatus,
+    std::shared_ptr<OnCompletionGuard> onCompletionGuard) {
+    LOGV2(21183,
+          "Finished cloning data: {databaseClonerFinishStatus}. Beginning oplog replay.",
+          "Finished cloning data. Beginning oplog replay",
+          "databaseClonerFinishStatus"_attr = redact(databaseClonerFinishStatus));
+    _client->shutdownAndDisallowReconnect();
+
+    if (MONGO_unlikely(initialSyncHangAfterDataCloningFCB.shouldFail())) {
+        // This could have been done with a scheduleWorkAt but this is used only by JS tests where
+        // we run with multiple threads so it's fine to spin on this thread.
+        // This log output is used in js tests so please leave it.
+        LOGV2(21184,
+              "initial sync - initialSyncHangAfterDataCloningFCB fail point "
+              "enabled. Blocking until fail point is disabled.");
+        while (MONGO_unlikely(initialSyncHangAfterDataCloningFCB.shouldFail()) &&
+               !_isShuttingDown()) {
+            mongo::sleepsecs(1);
+        }
+    }
+
+    stdx::lock_guard<Latch> lock(_mutex);
+    _client.reset();
+    auto status = _checkForShutdownAndConvertStatus_inlock(databaseClonerFinishStatus,
+                                                           "error cloning databases");
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+
+    // Since the stopTimestamp is retrieved after we have done all the work of retrieving collection
+    // data, we handle retries within this class by retrying for
+    // 'initialSyncTransientErrorRetryPeriodSeconds' (default 24 hours). 
This is the same retry + // strategy used when retrieving collection data, and avoids retrieving all the data and then + // throwing it away due to a transient network outage. + status = _scheduleLastOplogEntryFetcher_inlock( + [=](const StatusWith& status, + mongo::Fetcher::NextAction*, + mongo::BSONObjBuilder*) { + _lastOplogEntryFetcherCallbackForStopTimestamp(status, onCompletionGuard); + }, + kInitialSyncerHandlesRetries); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} + +void InitialSyncerFCB::_lastOplogEntryFetcherCallbackForStopTimestamp( + const StatusWith& result, + std::shared_ptr onCompletionGuard) { + OpTimeAndWallTime resultOpTimeAndWallTime = {OpTime(), Date_t()}; + { + { + stdx::lock_guard lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock( + result.getStatus(), "error fetching last oplog entry for stop timestamp"); + if (_shouldRetryError(lock, status)) { + auto scheduleStatus = + (*_attemptExec) + ->scheduleWork( + [this, onCompletionGuard](executor::TaskExecutor::CallbackArgs args) { + // It is not valid to schedule the retry from within this callback, + // hence we schedule a lambda to schedule the retry. + stdx::lock_guard lock(_mutex); + // Since the stopTimestamp is retrieved after we have done all the + // work of retrieving collection data, we handle retries within this + // class by retrying for + // 'initialSyncTransientErrorRetryPeriodSeconds' (default 24 hours). + // This is the same retry strategy used when retrieving collection + // data, and avoids retrieving all the data and then throwing it + // away due to a transient network outage. + auto status = _scheduleLastOplogEntryFetcher_inlock( + [=](const StatusWith& status, + mongo::Fetcher::NextAction*, + mongo::BSONObjBuilder*) { + _lastOplogEntryFetcherCallbackForStopTimestamp( + status, onCompletionGuard); + }, + kInitialSyncerHandlesRetries); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, status); + } + }); + if (scheduleStatus.isOK()) + return; + // If scheduling failed, we're shutting down and cannot retry. + // So just continue with the original failed status. + } + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + auto&& optimeStatus = parseOpTimeAndWallTime(result); + if (!optimeStatus.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, + optimeStatus.getStatus()); + return; + } + resultOpTimeAndWallTime = optimeStatus.getValue(); + } + + // Release the _mutex to write to disk. + auto opCtx = makeOpCtx(); + _replicationProcess->getConsistencyMarkers()->setMinValid(opCtx.get(), + resultOpTimeAndWallTime.opTime); + + stdx::lock_guard lock(_mutex); + _initialSyncState->stopTimestamp = resultOpTimeAndWallTime.opTime.getTimestamp(); + + // If the beginFetchingTimestamp is different from the stopTimestamp, it indicates that + // there are oplog entries fetched by the oplog fetcher that need to be written to the oplog + // and/or there are operations that need to be applied. + if (_initialSyncState->beginFetchingTimestamp != _initialSyncState->stopTimestamp) { + invariant(_lastApplied.opTime.isNull()); + _checkApplierProgressAndScheduleGetNextApplierBatch_inlock(lock, onCompletionGuard); + return; + } + } + + // Oplog at sync source has not advanced since we started cloning databases, so we use the last + // oplog entry to seed the oplog before checking the rollback ID. 
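+    // At this point beginFetchingTimestamp == stopTimestamp (the differing case
+    // returned above), so the last oplog entry fetched here is sufficient to seed
+    // the local oplog and there is nothing to apply.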
+ { + const auto& documents = result.getValue().documents; + invariant(!documents.empty()); + const BSONObj oplogSeedDoc = documents.front(); + LOGV2_DEBUG(21185, + 2, + "Inserting oplog seed document: {oplogSeedDocument}", + "Inserting oplog seed document", + "oplogSeedDocument"_attr = oplogSeedDoc); + + auto opCtx = makeOpCtx(); + // StorageInterface::insertDocument() has to be called outside the lock because we may + // override its behavior in tests. See InitialSyncerReturnsCallbackCanceledAndDoesNot- + // ScheduleRollbackCheckerIfShutdownAfterInsertingInsertOplogSeedDocument in + // initial_syncer_test.cpp + // + // Note that the initial seed oplog insertion is not timestamped, this is safe to do as the + // logic for navigating the oplog is reliant on the timestamp value of the oplog document + // itself. Additionally, this also prevents confusion in the storage engine as the last + // insertion can be produced at precisely the stable timestamp, which could lead to invalid + // data consistency due to the stable timestamp signalling that no operations before or at + // that point will be rolled back. So transactions shouldn't happen at precisely that point. + auto status = _storage->insertDocument(opCtx.get(), + NamespaceString::kRsOplogNamespace, + TimestampedBSONObj{oplogSeedDoc}, + resultOpTimeAndWallTime.opTime.getTerm()); + if (!status.isOK()) { + stdx::lock_guard lock(_mutex); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + const bool orderedCommit = true; + _storage->oplogDiskLocRegister( + opCtx.get(), resultOpTimeAndWallTime.opTime.getTimestamp(), orderedCommit); + } + + stdx::lock_guard lock(_mutex); + _lastApplied = resultOpTimeAndWallTime; + LOGV2(21186, + "No need to apply operations. (currently at {stopTimestamp})", + "No need to apply operations", + "stopTimestamp"_attr = _initialSyncState->stopTimestamp.toBSON()); + + // This sets the error in 'onCompletionGuard' and shuts down the OplogFetcher on error. + _scheduleRollbackCheckerCheckForRollback_inlock(lock, onCompletionGuard); +} + +void InitialSyncerFCB::_getNextApplierBatchCallback( + const executor::TaskExecutor::CallbackArgs& callbackArgs, + std::shared_ptr onCompletionGuard) noexcept try { + stdx::lock_guard lock(_mutex); + auto status = + _checkForShutdownAndConvertStatus_inlock(callbackArgs, "error getting next applier batch"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + auto batchResult = _getNextApplierBatch_inlock(); + if (!batchResult.isOK()) { + LOGV2_WARNING(21196, + "Failure creating next apply batch: {error}", + "Failure creating next apply batch", + "error"_attr = redact(batchResult.getStatus())); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, batchResult.getStatus()); + return; + } + + std::string logMsg = str::stream() + << "Initial Syncer is about to apply the next oplog batch of size: " + << batchResult.getValue().size(); + pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg); + + if (MONGO_unlikely(failInitialSyncBeforeApplyingBatchFCB.shouldFail())) { + LOGV2(21187, + "initial sync - failInitialSyncBeforeApplyingBatchFCB fail point enabled. 
Pausing " + "until " + "fail point is disabled, then will fail initial sync"); + failInitialSyncBeforeApplyingBatchFCB.pauseWhileSet(); + status = Status(ErrorCodes::CallbackCanceled, + "failInitialSyncBeforeApplyingBatchFCB fail point enabled"); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + // Schedule MultiApplier if we have operations to apply. + const auto& ops = batchResult.getValue(); + if (!ops.empty()) { + _fetchCount.store(0); + MultiApplier::MultiApplyFn applyBatchOfOperationsFn = [this](OperationContext* opCtx, + std::vector ops) { + return _oplogApplier->applyOplogBatch(opCtx, std::move(ops)); + }; + OpTime lastApplied = ops.back().getOpTime(); + Date_t lastAppliedWall = ops.back().getWallClockTime(); + + auto numApplied = ops.size(); + MultiApplier::CallbackFn onCompletionFn = [=](const Status& s) { + return _multiApplierCallback( + s, {lastApplied, lastAppliedWall}, numApplied, onCompletionGuard); + }; + + _applier = std::make_unique( + *_attemptExec, ops, std::move(applyBatchOfOperationsFn), std::move(onCompletionFn)); + status = _startupComponent_inlock(_applier); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + return; + } + + // If the oplog fetcher is no longer running (completed successfully) and the oplog buffer is + // empty, we are not going to make any more progress with this initial sync. Report progress so + // far and return a RemoteResultsUnavailable error. + if (!_oplogFetcher->isActive()) { + static constexpr char msg[] = + "The oplog fetcher is no longer running and we have applied all the oplog entries " + "in the oplog buffer. Aborting this initial sync attempt"; + LOGV2(21188, + msg, + "lastApplied"_attr = _lastApplied.opTime, + "lastFetched"_attr = _lastFetched, + "operationsApplied"_attr = _initialSyncState->appliedOps); + status = Status(ErrorCodes::RemoteResultsUnavailable, + str::stream() + << msg << ". Last applied: " << _lastApplied.opTime.toString() + << ". Last fetched: " << _lastFetched.toString() + << ". Number of operations applied: " << _initialSyncState->appliedOps); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + // If there are no operations at the moment to apply and the oplog fetcher is still waiting on + // the sync source, we'll check the oplog buffer again in + // '_opts.getApplierBatchCallbackRetryWait' ms. + auto when = (*_attemptExec)->now() + _opts.getApplierBatchCallbackRetryWait; + status = _scheduleWorkAtAndSaveHandle_inlock( + when, + [=](const CallbackArgs& args) { _getNextApplierBatchCallback(args, onCompletionGuard); }, + &_getNextApplierBatchHandle, + "_getNextApplierBatchCallback"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} catch (const DBException&) { + // Report exception as an initial syncer failure. 
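+    // exceptionToStatus() converts the in-flight DBException into a Status so that
+    // the completion guard can propagate it like any other initial sync error.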
+ stdx::unique_lock lock(_mutex); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus()); +} + +void InitialSyncerFCB::_multiApplierCallback(const Status& multiApplierStatus, + OpTimeAndWallTime lastApplied, + std::uint32_t numApplied, + std::shared_ptr onCompletionGuard) { + stdx::lock_guard lock(_mutex); + auto status = + _checkForShutdownAndConvertStatus_inlock(multiApplierStatus, "error applying batch"); + + // Set to cause initial sync to fassert instead of restart if applying a batch fails, so that + // tests can be robust to network errors but not oplog idempotency errors. + if (MONGO_unlikely(initialSyncFassertIfApplyingBatchFailsFCB.shouldFail())) { + LOGV2(21189, "initialSyncFassertIfApplyingBatchFailsFCB fail point enabled"); + fassert(31210, status); + } + + if (!status.isOK()) { + LOGV2_ERROR(21199, + "Failed to apply batch due to '{error}'", + "Failed to apply batch", + "error"_attr = redact(status)); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + _initialSyncState->appliedOps += numApplied; + _lastApplied = lastApplied; + const auto lastAppliedOpTime = _lastApplied.opTime; + _opts.setMyLastOptime(_lastApplied); + + // Update oplog visibility after applying a batch so that while applying transaction oplog + // entries, the TransactionHistoryIterator can get earlier oplog entries associated with the + // transaction. Note that setting the oplog visibility timestamp here will be safe even if + // initial sync was restarted because until initial sync ends, no one else will try to read our + // oplog. It is also safe even if we tried to read from our own oplog because we never try to + // read from the oplog before applying at least one batch and therefore setting a value for the + // oplog visibility timestamp. + auto opCtx = makeOpCtx(); + const bool orderedCommit = true; + _storage->oplogDiskLocRegister(opCtx.get(), lastAppliedOpTime.getTimestamp(), orderedCommit); + _checkApplierProgressAndScheduleGetNextApplierBatch_inlock(lock, onCompletionGuard); +} + +void InitialSyncerFCB::_rollbackCheckerCheckForRollbackCallback( + const RollbackChecker::Result& result, std::shared_ptr onCompletionGuard) { + stdx::lock_guard lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock(result.getStatus(), + "error while getting last rollback ID"); + if (_shouldRetryError(lock, status)) { + LOGV2_DEBUG(21190, + 1, + "Retrying rollback checker because of network error {error}", + "Retrying rollback checker because of network error", + "error"_attr = status); + _scheduleRollbackCheckerCheckForRollback_inlock(lock, onCompletionGuard); + return; + } + + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + auto hasHadRollback = result.getValue(); + if (hasHadRollback) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + Status(ErrorCodes::UnrecoverableRollbackError, + str::stream() << "Rollback occurred on our sync source " << _syncSource + << " during initial sync")); + return; + } + + if (MONGO_unlikely(initialSyncHangBeforeCompletingOplogFetchingFCB.shouldFail())) { + LOGV2(4599500, "initialSyncHangBeforeCompletingOplogFetchingFCB fail point enabled"); + initialSyncHangBeforeCompletingOplogFetchingFCB.pauseWhileSet(); + } + + // Success! 
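+    // Resolving the guard with _lastApplied (an OpTimeAndWallTime) rather than an
+    // error Status is what marks this attempt as successful; the guard hands the
+    // value to _finishInitialSyncAttempt().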
+ onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, _lastApplied); +} + +void InitialSyncerFCB::_finishInitialSyncAttempt(const StatusWith& lastApplied) { + // Since _finishInitialSyncAttempt can be called from any component's callback function or + // scheduled task, it is possible that we may not be in a TaskExecutor-managed thread when this + // function is invoked. + // For example, if CollectionCloner fails while inserting documents into the + // CollectionBulkLoader, we will get here via one of CollectionCloner's TaskRunner callbacks + // which has an active OperationContext bound to the current Client. This would lead to an + // invariant when we attempt to create a new OperationContext for _tearDown(opCtx). + // To avoid this, we schedule _finishCallback against the TaskExecutor rather than calling it + // here synchronously. + + // Unless dismissed, a scope guard will schedule _finishCallback() upon exiting this function. + // Since it is a requirement that _finishCallback be called outside the lock (which is possible + // if the task scheduling fails and we have to invoke _finishCallback() synchronously), we + // declare the scope guard before the lock guard. + auto result = lastApplied; + ScopeGuard finishCallbackGuard([this, &result] { + auto scheduleResult = _exec->scheduleWork( + [=](const mongo::executor::TaskExecutor::CallbackArgs&) { _finishCallback(result); }); + if (!scheduleResult.isOK()) { + LOGV2_WARNING(21197, + "Unable to schedule initial syncer completion task due to " + "{error}. Running callback on current thread.", + "Unable to schedule initial syncer completion task. Running callback on " + "current thread", + "error"_attr = redact(scheduleResult.getStatus())); + _finishCallback(result); + } + }); + + LOGV2(21191, "Initial sync attempt finishing up"); + + stdx::lock_guard lock(_mutex); + + auto runTime = _initialSyncState ? _initialSyncState->timer.millis() : 0; + int rollBackId = -1; + int operationsRetried = 0; + int totalTimeUnreachableMillis = 0; + if (_sharedData) { + stdx::lock_guard sdLock(*_sharedData); + rollBackId = _sharedData->getRollBackId(); + operationsRetried = _sharedData->getTotalRetries(sdLock); + totalTimeUnreachableMillis = + durationCount(_sharedData->getTotalTimeUnreachable(sdLock)); + } + + if (MONGO_unlikely(failAndHangInitialSyncFCB.shouldFail())) { + LOGV2(21193, "failAndHangInitialSyncFCB fail point enabled"); + failAndHangInitialSyncFCB.pauseWhileSet(); + result = Status(ErrorCodes::InternalError, "failAndHangInitialSyncFCB fail point enabled"); + } + + _stats.initialSyncAttemptInfos.emplace_back( + InitialSyncerFCB::InitialSyncAttemptInfo{runTime, + result.getStatus(), + _syncSource, + rollBackId, + operationsRetried, + totalTimeUnreachableMillis}); + + if (!result.isOK()) { + // This increments the number of failed attempts for the current initial sync request. + ++_stats.failedInitialSyncAttempts; + // This increments the number of failed attempts across all initial sync attempts since + // process startup. + initial_sync_common_stats::initialSyncFailedAttempts.increment(); + } + + bool hasRetries = _stats.failedInitialSyncAttempts < _stats.maxFailedInitialSyncAttempts; + + initial_sync_common_stats::LogInitialSyncAttemptStats( + result, hasRetries, _getInitialSyncProgress_inlock()); + + if (result.isOK()) { + // Scope guard will invoke _finishCallback(). 
+        return;
+    }
+
+    LOGV2_ERROR(21200,
+                "Initial sync attempt failed -- attempts left: "
+                "{attemptsLeft} cause: "
+                "{error}",
+                "Initial sync attempt failed",
+                "attemptsLeft"_attr =
+                    (_stats.maxFailedInitialSyncAttempts - _stats.failedInitialSyncAttempts),
+                "error"_attr = redact(result.getStatus()));
+
+    // Check if need to do more retries.
+    if (!hasRetries) {
+        LOGV2_FATAL_CONTINUE(21202,
+                             "The maximum number of retries has been exhausted for initial sync");
+
+        initial_sync_common_stats::initialSyncFailures.increment();
+
+        // Scope guard will invoke _finishCallback().
+        return;
+    }
+
+    _attemptExec = std::make_unique<executor::ScopedTaskExecutor>(
+        _exec, Status(ErrorCodes::CallbackCanceled, "Initial Sync Attempt Canceled"));
+    _clonerAttemptExec = std::make_unique<executor::ScopedTaskExecutor>(
+        _clonerExec, Status(ErrorCodes::CallbackCanceled, "Initial Sync Attempt Canceled"));
+    _attemptCanceled = false;
+    auto when = (*_attemptExec)->now() + _opts.initialSyncRetryWait;
+    auto status = _scheduleWorkAtAndSaveHandle_inlock(
+        when,
+        [=](const executor::TaskExecutor::CallbackArgs& args) {
+            _startInitialSyncAttemptCallback(
+                args, _stats.failedInitialSyncAttempts, _stats.maxFailedInitialSyncAttempts);
+        },
+        &_startInitialSyncAttemptHandle,
+        str::stream() << "_startInitialSyncAttemptCallback-" << _stats.failedInitialSyncAttempts);
+
+    if (!status.isOK()) {
+        result = status;
+        // Scope guard will invoke _finishCallback().
+        return;
+    }
+
+    // Next initial sync attempt scheduled successfully and we do not need to call _finishCallback()
+    // until the next initial sync attempt finishes.
+    finishCallbackGuard.dismiss();
+}
+
+void InitialSyncerFCB::_finishCallback(StatusWith<OpTimeAndWallTime> lastApplied) {
+    // After running callback function, clear '_onCompletion' to release any resources that might be
+    // held by this function object.
+    // '_onCompletion' must be moved to a temporary copy and destroyed outside the lock in case
+    // there is any logic that's invoked at the function object's destruction that might call into
+    // this InitialSyncerFCB. 'onCompletion' must be destroyed outside the lock and this should
+    // happen before we transition the state to Complete.
+    decltype(_onCompletion) onCompletion;
+    {
+        stdx::lock_guard<Latch> lock(_mutex);
+        auto opCtx = makeOpCtx();
+        _tearDown_inlock(opCtx.get(), lastApplied);
+        invariant(_onCompletion);
+        std::swap(_onCompletion, onCompletion);
+    }
+
+    if (MONGO_unlikely(initialSyncHangBeforeFinishFCB.shouldFail())) {
+        // This log output is used in js tests so please leave it.
+        LOGV2(21194,
+              "initial sync - initialSyncHangBeforeFinishFCB fail point "
+              "enabled. Blocking until fail point is disabled.");
+        while (MONGO_unlikely(initialSyncHangBeforeFinishFCB.shouldFail()) && !_isShuttingDown()) {
+            mongo::sleepsecs(1);
+        }
+    }
+
+    // Any _retryingOperation is no longer active. This must be done before signalling state
+    // Complete.
+    _retryingOperation = boost::none;
+
+    // Completion callback must be invoked outside mutex.
+    try {
+        onCompletion(lastApplied);
+    } catch (...) {
+        LOGV2_WARNING(21198,
+                      "initial syncer finish callback threw exception: {error}",
+                      "Initial syncer finish callback threw exception",
+                      "error"_attr = redact(exceptionToStatus()));
+    }
+
+    // Destroy the remaining reference to the completion callback before we transition the state to
+    // Complete so that callers can expect any resources bound to '_onCompletion' to be released
+    // before InitialSyncerFCB::join() returns. 
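+    // Assigning an empty function object releases any state captured by the
+    // completion callback while we are still outside the mutex.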
+ onCompletion = {}; + + { + stdx::lock_guard lock(_mutex); + invariant(_state != State::kComplete); + _state = State::kComplete; + _stateCondition.notify_all(); + + // Clear the initial sync progress after an initial sync attempt has been successfully + // completed. + if (lastApplied.isOK() && !MONGO_unlikely(skipClearInitialSyncStateFCB.shouldFail())) { + _initialSyncState.reset(); + } + + // Destroy shared references to executors. + _attemptExec = nullptr; + _clonerAttemptExec = nullptr; + _clonerExec = nullptr; + _exec = nullptr; + } + + if (MONGO_unlikely(initialSyncHangAfterFinishFCB.shouldFail())) { + LOGV2(5825800, + "initial sync finished - initialSyncHangAfterFinishFCB fail point " + "enabled. Blocking until fail point is disabled."); + while (MONGO_unlikely(initialSyncHangAfterFinishFCB.shouldFail()) && !_isShuttingDown()) { + mongo::sleepsecs(1); + } + } +} + +Status InitialSyncerFCB::_scheduleLastOplogEntryFetcher_inlock( + Fetcher::CallbackFn callback, LastOplogEntryFetcherRetryStrategy retryStrategy) { + BSONObj query = + BSON("find" << NamespaceString::kRsOplogNamespace.coll() << "sort" << BSON("$natural" << -1) + << "limit" << 1 << ReadConcernArgs::kReadConcernFieldName + << ReadConcernArgs::kLocal); + + _lastOplogEntryFetcher = std::make_unique( + *_attemptExec, + _syncSource, + NamespaceString::kRsOplogNamespace.db().toString(), + query, + callback, + ReadPreferenceSetting::secondaryPreferredMetadata(), + RemoteCommandRequest::kNoTimeout /* find network timeout */, + RemoteCommandRequest::kNoTimeout /* getMore network timeout */, + (retryStrategy == kFetcherHandlesRetries) + ? RemoteCommandRetryScheduler::makeRetryPolicy( + numInitialSyncOplogFindAttempts.load(), + executor::RemoteCommandRequest::kNoTimeout) + : RemoteCommandRetryScheduler::makeNoRetryPolicy()); + Status scheduleStatus = _lastOplogEntryFetcher->schedule(); + if (!scheduleStatus.isOK()) { + _lastOplogEntryFetcher.reset(); + } + + return scheduleStatus; +} + +void InitialSyncerFCB::_checkApplierProgressAndScheduleGetNextApplierBatch_inlock( + const stdx::lock_guard& lock, std::shared_ptr onCompletionGuard) { + // We should check our current state because shutdown() could have been called before + // we re-acquired the lock. + if (_isShuttingDown_inlock()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + Status(ErrorCodes::CallbackCanceled, + "failed to schedule applier to check for " + "rollback: initial syncer is shutting down")); + return; + } + + // Basic sanity check on begin/stop timestamps. + if (_initialSyncState->beginApplyingTimestamp > _initialSyncState->stopTimestamp) { + static constexpr char msg[] = "Possible rollback on sync source"; + LOGV2_ERROR(21201, + msg, + "syncSource"_attr = _syncSource, + "stopTimestamp"_attr = _initialSyncState->stopTimestamp.toBSON(), + "beginApplyingTimestamp"_attr = + _initialSyncState->beginApplyingTimestamp.toBSON()); + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + Status(ErrorCodes::OplogOutOfOrder, + str::stream() << msg << " " << _syncSource.toString() << ". Currently at " + << _initialSyncState->stopTimestamp.toBSON() << ". Started at " + << _initialSyncState->beginApplyingTimestamp.toBSON())); + return; + } + + if (_lastApplied.opTime.isNull()) { + // Check if any ops occurred while cloning or any ops need to be fetched. 
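+        // Strict inequality holds here: the equal case was handled (and returned) in
+        // _lastOplogEntryFetcherCallbackForStopTimestamp(), and the sanity check above
+        // rules out begin timestamps past the stopTimestamp.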
+ invariant(_initialSyncState->beginFetchingTimestamp < _initialSyncState->stopTimestamp); + LOGV2(21195, + "Writing to the oplog and applying operations until {stopTimestamp} " + "before initial sync can complete. (started fetching at " + "{beginFetchingTimestamp} and applying at " + "{beginApplyingTimestamp})", + "Writing to the oplog and applying operations until stopTimestamp before initial " + "sync can complete", + "stopTimestamp"_attr = _initialSyncState->stopTimestamp.toBSON(), + "beginFetchingTimestamp"_attr = _initialSyncState->beginFetchingTimestamp.toBSON(), + "beginApplyingTimestamp"_attr = _initialSyncState->beginApplyingTimestamp.toBSON()); + // Fall through to scheduling _getNextApplierBatchCallback(). + } else if (_lastApplied.opTime.getTimestamp() >= _initialSyncState->stopTimestamp) { + // Check for rollback if we have applied far enough to be consistent. + invariant(!_lastApplied.opTime.getTimestamp().isNull()); + _scheduleRollbackCheckerCheckForRollback_inlock(lock, onCompletionGuard); + return; + } + + // Get another batch to apply. + // _scheduleWorkAndSaveHandle_inlock() is shutdown-aware. + auto status = _scheduleWorkAndSaveHandle_inlock( + [=](const executor::TaskExecutor::CallbackArgs& args) { + return _getNextApplierBatchCallback(args, onCompletionGuard); + }, + &_getNextApplierBatchHandle, + "_getNextApplierBatchCallback"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} + +void InitialSyncerFCB::_scheduleRollbackCheckerCheckForRollback_inlock( + const stdx::lock_guard& lock, std::shared_ptr onCompletionGuard) { + // We should check our current state because shutdown() could have been called before + // we re-acquired the lock. + if (_isShuttingDown_inlock()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + Status(ErrorCodes::CallbackCanceled, + "failed to schedule rollback checker to check " + "for rollback: initial syncer is shutting " + "down")); + return; + } + + auto scheduleResult = + _rollbackChecker->checkForRollback([=](const RollbackChecker::Result& result) { + _rollbackCheckerCheckForRollbackCallback(result, onCompletionGuard); + }); + + auto status = scheduleResult.getStatus(); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + _getLastRollbackIdHandle = scheduleResult.getValue(); +} + +bool InitialSyncerFCB::_shouldRetryError(WithLock lk, Status status) { + if (ErrorCodes::isRetriableError(status)) { + stdx::lock_guard sharedDataLock(*_sharedData); + return _sharedData->shouldRetryOperation(sharedDataLock, &_retryingOperation); + } + // The status was OK or some error other than a retriable error, so clear the retriable error + // state and indicate that we should not retry. 
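+    // ErrorCodes::isRetriableError() covers transient network-type codes such as
+    // HostUnreachable or NetworkTimeout; any other status falls through to here
+    // and resets the retry bookkeeping.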
+    _clearRetriableError(lk);
+    return false;
+}
+
+void InitialSyncerFCB::_clearRetriableError(WithLock lk) {
+    _retryingOperation = boost::none;
+}
+
+Status InitialSyncerFCB::_checkForShutdownAndConvertStatus_inlock(
+    const executor::TaskExecutor::CallbackArgs& callbackArgs, const std::string& message) {
+    return _checkForShutdownAndConvertStatus_inlock(callbackArgs.status, message);
+}
+
+Status InitialSyncerFCB::_checkForShutdownAndConvertStatus_inlock(const Status& status,
+                                                                  const std::string& message) {
+
+    if (_isShuttingDown_inlock()) {
+        return {ErrorCodes::CallbackCanceled, message + ": initial syncer is shutting down"};
+    }
+
+    return status.withContext(message);
+}
+
+Status InitialSyncerFCB::_scheduleWorkAndSaveHandle_inlock(
+    executor::TaskExecutor::CallbackFn work,
+    executor::TaskExecutor::CallbackHandle* handle,
+    const std::string& name) {
+    invariant(handle);
+    if (_isShuttingDown_inlock()) {
+        return {ErrorCodes::CallbackCanceled,
+                str::stream() << "failed to schedule work " << name
+                              << ": initial syncer is shutting down"};
+    }
+    auto result = (*_attemptExec)->scheduleWork(std::move(work));
+    if (!result.isOK()) {
+        return result.getStatus().withContext(str::stream() << "failed to schedule work " << name);
+    }
+    *handle = result.getValue();
+    return Status::OK();
+}
+
+Status InitialSyncerFCB::_scheduleWorkAtAndSaveHandle_inlock(
+    Date_t when,
+    executor::TaskExecutor::CallbackFn work,
+    executor::TaskExecutor::CallbackHandle* handle,
+    const std::string& name) {
+    invariant(handle);
+    if (_isShuttingDown_inlock()) {
+        return {ErrorCodes::CallbackCanceled,
+                str::stream() << "failed to schedule work " << name << " at " << when.toString()
+                              << ": initial syncer is shutting down"};
+    }
+    auto result = (*_attemptExec)->scheduleWorkAt(when, std::move(work));
+    if (!result.isOK()) {
+        return result.getStatus().withContext(str::stream() << "failed to schedule work " << name
+                                                            << " at " << when.toString());
+    }
+    *handle = result.getValue();
+    return Status::OK();
+}
+
+void InitialSyncerFCB::_cancelHandle_inlock(executor::TaskExecutor::CallbackHandle handle) {
+    if (!handle) {
+        return;
+    }
+    (*_attemptExec)->cancel(handle);
+}
+
+template <typename Component>
+Status InitialSyncerFCB::_startupComponent_inlock(Component& component) {
+    // It is necessary to check if shutdown or attempt cancelling happens before starting a
+    // component; otherwise the component may call a callback function in line which will
+    // cause a deadlock when the callback attempts to obtain the initial syncer mutex.
+    if (_isShuttingDown_inlock() || _attemptCanceled) {
+        component.reset();
+        if (_isShuttingDown_inlock()) {
+            return {ErrorCodes::CallbackCanceled,
+                    "initial syncer shutdown while trying to call startup() on component"};
+        } else {
+            return {ErrorCodes::CallbackCanceled,
+                    "initial sync attempt canceled while trying to call startup() on component"};
+        }
+    }
+    auto status = component->startup();
+    if (!status.isOK()) {
+        component.reset();
+    }
+    return status;
+}
+
+template <typename Component>
+void InitialSyncerFCB::_shutdownComponent_inlock(Component& component) {
+    if (!component) {
+        return;
+    }
+    component->shutdown();
+}
+
+StatusWith<std::vector<OplogEntry>> InitialSyncerFCB::_getNextApplierBatch_inlock() {
+    // If the fail-point is active, delay the apply batch by returning an empty batch so that
+    // _getNextApplierBatchCallback() will reschedule itself at a later time.
+    // See InitialSyncerInterface::Options::getApplierBatchCallbackRetryWait. 
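+    // An empty batch is treated by _getNextApplierBatchCallback() as "nothing to
+    // do yet": it reschedules itself after getApplierBatchCallbackRetryWait
+    // instead of failing the attempt.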
+    if (MONGO_unlikely(rsSyncApplyStopFCB.shouldFail())) {
+        return std::vector<OplogEntry>();
+    }
+
+    // Obtain next batch of operations from OplogApplier.
+    auto opCtx = makeOpCtx();
+    OplogApplier::BatchLimits batchLimits;
+    batchLimits.bytes = replBatchLimitBytes.load();
+    batchLimits.ops = getBatchLimitOplogEntries();
+    // We want a batch boundary after the beginApplyingTimestamp, to make sure all oplog entries
+    // that are part of a transaction before that timestamp are written out before we start applying
+    // entries after them. This is because later entries may be commit or prepare and thus
+    // expect to read the partial entries from the oplog.
+    batchLimits.forceBatchBoundaryAfter = _initialSyncState->beginApplyingTimestamp;
+    return _oplogApplier->getNextApplierBatch(opCtx.get(), batchLimits);
+}
+
+StatusWith<HostAndPort> InitialSyncerFCB::_chooseSyncSource_inlock() {
+    auto syncSource = _opts.syncSourceSelector->chooseNewSyncSource(_lastFetched);
+    if (syncSource.empty()) {
+        return Status{ErrorCodes::InvalidSyncSource,
+                      str::stream() << "No valid sync source available. Our last fetched optime: "
+                                    << _lastFetched.toString()};
+    }
+    return syncSource;
+}
+
+Status InitialSyncerFCB::_enqueueDocuments(OplogFetcher::Documents::const_iterator begin,
+                                           OplogFetcher::Documents::const_iterator end,
+                                           const OplogFetcher::DocumentsInfo& info) {
+    if (info.toApplyDocumentCount == 0) {
+        return Status::OK();
+    }
+
+    if (_isShuttingDown()) {
+        return Status::OK();
+    }
+
+    invariant(_oplogBuffer);
+
+    // Wait for enough space.
+    _oplogApplier->waitForSpace(makeOpCtx().get(), info.toApplyDocumentBytes);
+
+    // Buffer docs for later application.
+    _oplogApplier->enqueue(makeOpCtx().get(), begin, end);
+
+    _lastFetched = info.lastDocument;
+
+    // TODO: update metrics with "info".
+    return Status::OK();
+}
+
+std::string InitialSyncerFCB::Stats::toString() const {
+    return toBSON().toString();
+}
+
+BSONObj InitialSyncerFCB::Stats::toBSON() const {
+    BSONObjBuilder bob;
+    append(&bob);
+    return bob.obj();
+}
+
+void InitialSyncerFCB::Stats::append(BSONObjBuilder* builder) const {
+    builder->appendNumber("failedInitialSyncAttempts",
+                          static_cast<long long>(failedInitialSyncAttempts));
+    builder->appendNumber("maxFailedInitialSyncAttempts",
+                          static_cast<long long>(maxFailedInitialSyncAttempts));
+
+    auto e = exec.lock();
+    if (initialSyncStart != Date_t()) {
+        builder->appendDate("initialSyncStart", initialSyncStart);
+        auto elapsedDurationEnd = e ? 
e->now() : Date_t::now();
+        if (initialSyncEnd != Date_t()) {
+            builder->appendDate("initialSyncEnd", initialSyncEnd);
+            elapsedDurationEnd = initialSyncEnd;
+        }
+        long long elapsedMillis =
+            duration_cast<Milliseconds>(elapsedDurationEnd - initialSyncStart).count();
+        builder->appendNumber("totalInitialSyncElapsedMillis", elapsedMillis);
+    }
+
+    BSONArrayBuilder arrBuilder(builder->subarrayStart("initialSyncAttempts"));
+    for (auto const& attemptInfo : initialSyncAttemptInfos) {
+        arrBuilder.append(attemptInfo.toBSON());
+    }
+    arrBuilder.doneFast();
+}
+
+std::string InitialSyncerFCB::InitialSyncAttemptInfo::toString() const {
+    return toBSON().toString();
+}
+
+BSONObj InitialSyncerFCB::InitialSyncAttemptInfo::toBSON() const {
+    BSONObjBuilder bob;
+    append(&bob);
+    return bob.obj();
+}
+
+void InitialSyncerFCB::InitialSyncAttemptInfo::append(BSONObjBuilder* builder) const {
+    builder->appendNumber("durationMillis", durationMillis);
+    builder->append("status", status.toString());
+    builder->append("syncSource", syncSource.toString());
+    if (rollBackId >= 0) {
+        builder->append("rollBackId", rollBackId);
+    }
+    builder->append("operationsRetried", operationsRetried);
+    builder->append("totalTimeUnreachableMillis", totalTimeUnreachableMillis);
+}
+
+bool InitialSyncerFCB::OplogFetcherRestartDecisionInitialSyncer::shouldContinue(
+    OplogFetcher* fetcher, Status status) {
+    if (ErrorCodes::isRetriableError(status)) {
+        stdx::lock_guard<InitialSyncSharedData> lk(*_sharedData);
+        return _sharedData->shouldRetryOperation(lk, &_retryingOperation);
+    }
+    // A non-network error occurred, so clear any network error and use the default restart
+    // strategy.
+    _retryingOperation = boost::none;
+    return _defaultDecision.shouldContinue(fetcher, status);
+}
+
+void InitialSyncerFCB::OplogFetcherRestartDecisionInitialSyncer::fetchSuccessful(
+    OplogFetcher* fetcher) {
+    _retryingOperation = boost::none;
+    _defaultDecision.fetchSuccessful(fetcher);
+}
+
+}  // namespace repl
+}  // namespace mongo
diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h
new file mode 100644
index 0000000000000..54f271fcf106c
--- /dev/null
+++ b/src/mongo/db/repl/initial_syncer_fcb.h
@@ -0,0 +1,726 @@
+/*======
+This file is part of Percona Server for MongoDB.
+
+Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved.
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the Server Side Public License, version 1,
+    as published by MongoDB, Inc.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    Server Side Public License for more details.
+
+    You should have received a copy of the Server Side Public License
+    along with this program. If not, see
+    <http://www.mongodb.com/licensing/server-side-public-license>.
+
+    As a special exception, the copyright holders give permission to link the
+    code of portions of this program with the OpenSSL library under certain
+    conditions as described in each individual source file and distribute
+    linked combinations including the program with the OpenSSL library. You
+    must comply with the Server Side Public License in all respects for
+    all of the code used other than as permitted herein. If you modify file(s)
+    with this exception, you may extend this exception to your version of the
+    file(s), but you are not obligated to do so. If you do not wish to do so,
+    delete this exception statement from your version. 
If you delete this + exception statement from all source files in the program, then also delete + it in the license file. +======= */ + + +#pragma once + +#include +#include +#include +#include +#include + +#include "mongo/base/status.h" +#include "mongo/base/status_with.h" +#include "mongo/base/string_data.h" +#include "mongo/bson/bsonobj.h" +#include "mongo/bson/bsonobjbuilder.h" +#include "mongo/client/fetcher.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/repl/callback_completion_guard.h" +#include "mongo/db/repl/data_replicator_external_state.h" +#include "mongo/db/repl/initial_sync_shared_data.h" +#include "mongo/db/repl/initial_syncer_interface.h" +#include "mongo/db/repl/multiapplier.h" +#include "mongo/db/repl/oplog_applier.h" +#include "mongo/db/repl/oplog_buffer.h" +#include "mongo/db/repl/oplog_entry.h" +#include "mongo/db/repl/oplog_fetcher.h" +#include "mongo/db/repl/optime.h" +#include "mongo/db/repl/rollback_checker.h" +#include "mongo/executor/scoped_task_executor.h" +#include "mongo/executor/task_executor.h" +#include "mongo/platform/atomic_word.h" +#include "mongo/platform/mutex.h" +#include "mongo/stdx/condition_variable.h" +#include "mongo/stdx/mutex.h" +#include "mongo/util/concurrency/thread_pool.h" +#include "mongo/util/concurrency/with_lock.h" +#include "mongo/util/duration.h" +#include "mongo/util/fail_point.h" +#include "mongo/util/net/hostandport.h" +#include "mongo/util/time_support.h" + +namespace mongo { +namespace repl { + +// TODO: Remove forward declares once we remove rs_initialsync.cpp and other dependents. +// Failpoint which fails initial sync and leaves an oplog entry in the buffer. +extern FailPoint failInitSyncWithBufferedEntriesLeftFCB; + +// Failpoint which causes the initial sync function to hang before copying databases. +extern FailPoint initialSyncHangBeforeCopyingDatabasesFCB; + +// Failpoint which stops the applier. +extern FailPoint rsSyncApplyStopFCB; + +struct InitialSyncState; +class ReplicationProcess; +class StorageInterface; + +/** + * The initial syncer provides services to keep collection in sync by replicating + * changes via an oplog source to the local system storage. + * + * This class will use existing machinery like the Executor to schedule work and + * network tasks, as well as provide serial access and synchronization of state. + * + * + * Entry Points: + * -- startup: Start initial sync. + */ +class InitialSyncerFCB : public InitialSyncerInterface { +public: + InitialSyncerFCB(const InitialSyncerFCB&) = delete; + InitialSyncerFCB& operator=(const InitialSyncerFCB&) = delete; + InitialSyncerFCB(InitialSyncerFCB&&) = delete; + InitialSyncerFCB& operator=(InitialSyncerFCB&&) = delete; + + /** + * Callback completion guard for initial syncer. 
+     */
+    using OnCompletionGuard = CallbackCompletionGuard<StatusWith<OpTimeAndWallTime>>;
+
+    struct InitialSyncAttemptInfo {
+        int durationMillis;
+        Status status;
+        HostAndPort syncSource;
+        int rollBackId;
+        int operationsRetried;
+        int totalTimeUnreachableMillis;
+
+        std::string toString() const;
+        BSONObj toBSON() const;
+        void append(BSONObjBuilder* builder) const;
+    };
+
+    class OplogFetcherRestartDecisionInitialSyncer
+        : public OplogFetcher::OplogFetcherRestartDecision {
+
+    public:
+        OplogFetcherRestartDecisionInitialSyncer(InitialSyncSharedData* sharedData,
+                                                 std::size_t maxFetcherRestarts)
+            : _sharedData(sharedData), _defaultDecision(maxFetcherRestarts){};
+
+        bool shouldContinue(OplogFetcher* fetcher, Status status) final;
+
+        void fetchSuccessful(OplogFetcher* fetcher) final;
+
+    private:
+        InitialSyncSharedData* _sharedData;
+
+        // We delegate to the default strategy when it's a non-network error.
+        OplogFetcher::OplogFetcherRestartDecisionDefault _defaultDecision;
+
+        // The operation, if any, currently being retried because of a network error.
+        InitialSyncSharedData::RetryableOperation _retryingOperation;
+    };
+
+    struct Stats {
+        std::uint32_t failedInitialSyncAttempts{0};
+        std::uint32_t maxFailedInitialSyncAttempts{0};
+        Date_t initialSyncStart;
+        Date_t initialSyncEnd;
+        std::vector<InitialSyncerFCB::InitialSyncAttemptInfo> initialSyncAttemptInfos;
+        std::weak_ptr<executor::TaskExecutor> exec;
+
+        std::string toString() const;
+        BSONObj toBSON() const;
+        void append(BSONObjBuilder* builder) const;
+    };
+
+    InitialSyncerFCB(InitialSyncerInterface::Options opts,
+                     std::unique_ptr<DataReplicatorExternalState> dataReplicatorExternalState,
+                     ThreadPool* writerPool,
+                     StorageInterface* storage,
+                     ReplicationProcess* replicationProcess,
+                     const OnCompletionFn& onCompletion);
+
+    ~InitialSyncerFCB() override;
+
+    /**
+     * Returns true if an initial sync is currently running or in the process of shutting down.
+     */
+    bool isActive() const;
+
+    std::string getInitialSyncMethod() const final;
+
+    bool allowLocalDbAccess() const final {
+        return true;
+    }
+
+    Status startup(OperationContext* opCtx, std::uint32_t maxAttempts) noexcept final;
+
+    Status shutdown() final;
+
+    void join() final;
+
+    /**
+     * Returns internal state in a loggable format.
+     */
+    std::string getDiagnosticString() const;
+
+    BSONObj getInitialSyncProgress() const final;
+
+    void cancelCurrentAttempt() final;
+
+    /**
+     *
+     * Overrides how the initial syncer creates the client.
+     *
+     * For testing only
+     */
+    void setCreateClientFn_forTest(const CreateClientFn& createClientFn);
+
+    /**
+     *
+     * Overrides how the initial syncer creates the OplogFetcher.
+     *
+     * For testing only.
+     */
+    void setCreateOplogFetcherFn_forTest(std::unique_ptr<OplogFetcherFactory> createOplogFetcherFn);
+
+    /**
+     *
+     * Get a raw pointer to the OplogFetcher. Block up to 10s until the underlying OplogFetcher has
+     * started. It is the caller's responsibility to not reuse this pointer beyond the lifetime of
+     * the underlying OplogFetcher.
+     *
+     * For testing only.
+     */
+    OplogFetcher* getOplogFetcher_forTest() const;
+
+    /**
+     *
+     * Provides a separate executor for the cloners, so network operations based on
+     * TaskExecutor::scheduleRemoteCommand() can use the NetworkInterfaceMock while the cloners
+     * are stopped on a failpoint.
+     *
+     * For testing only
+     */
+    void setClonerExecutor_forTest(std::shared_ptr<executor::TaskExecutor> clonerExec);
+
+    /**
+     *
+     * Wait for the cloner thread to finish.
+     *
+     * For testing only
+     */
+    void waitForCloner_forTest();
+
+    // State transitions:
+    // PreStart --> Running --> ShuttingDown --> Complete
+    // It is possible to skip intermediate states. 
For example, calling shutdown() when the data + // replicator has not started will transition from PreStart directly to Complete. + enum class State { kPreStart, kRunning, kShuttingDown, kComplete }; + + /** + * Returns current initial syncer state. + * For testing only. + */ + State getState_forTest() const; + + /** + * Returns the wall clock time component of _lastApplied. + * For testing only. + */ + Date_t getWallClockTime_forTest() const; + + /** + * Sets the allowed outage duration in _sharedData. + * For testing only. + */ + void setAllowedOutageDuration_forTest(Milliseconds allowedOutageDuration); + +private: + enum LastOplogEntryFetcherRetryStrategy { + kFetcherHandlesRetries, + kInitialSyncerHandlesRetries + }; + + /** + * Returns true if we are still processing initial sync tasks (_state is either Running or + * Shutdown). + */ + bool _isActive_inlock() const; + + /** + * Cancels all outstanding work. + * Used by shutdown() and CompletionGuard::setResultAndCancelRemainingWork(). + */ + void _cancelRemainingWork_inlock(); + + /** + * Returns true if the initial syncer has received a shutdown request (_state is ShuttingDown). + */ + bool _isShuttingDown() const; + bool _isShuttingDown_inlock() const; + + /** + * Initial sync flowchart: + * + * start() + * | + * | + * V + * _setUp_inlock() + * | + * | + * V + * _startInitialSyncAttemptCallback() + * | + * | + * |<-------+ + * | | + * | | (bad sync source) + * | | + * V | + * _chooseSyncSourceCallback() + * | + * | + * | (good sync source found) + * | + * | + * V + * _truncateOplogAndDropReplicatedDatabases() + * | + * | + * V + * _rollbackCheckerResetCallback() + * | + * | + * V + * _lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime() + * | + * | + * V + * _getBeginFetchingOpTimeCallback() + * | + * | + * V + * _lastOplogEntryFetcherCallbackForBeginApplyingTimestamp() + * | + * | + * V + * _fcvFetcherCallback() + * | + * | + * +------------------------------+ + * | | + * | | + * V V + * _oplogFetcherCallback() _allDatabaseClonerCallback + * | | + * | | + * | V + * | _lastOplogEntryFetcherCallbackForStopTimestamp() + * | | | + * | | | + * | (no ops to apply) | | (have ops to apply) + * | | | + * | | V + * | | _getNextApplierBatchCallback() + * | | | ^ + * | | | | + * | | | (end ts not reached) + * | | | | + * | | V | + * | | _multiApplierCallback()-----+ + * | | | + * | | | + * | (reached end timestamp) + * | | | + * | V V + * | _rollbackCheckerCheckForRollbackCallback() + * | | + * | | + * +------------------------------+ + * | + * | + * V + * _finishInitialSyncAttempt() + * | + * | + * V + * _finishCallback() + */ + + /** + * Sets up internal state to begin initial sync. + */ + void _setUp_inlock(OperationContext* opCtx, std::uint32_t initialSyncMaxAttempts); + + /** + * Tears down internal state before reporting final status to caller. + */ + void _tearDown_inlock(OperationContext* opCtx, + const StatusWith& lastApplied); + + /** + * Callback to start a single initial sync attempt. + */ + void _startInitialSyncAttemptCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs, + std::uint32_t initialSyncAttempt, + std::uint32_t initialSyncMaxAttempts) noexcept; + + /** + * Callback to obtain sync source from sync source selector. + * For every initial sync attempt, we will try up to 'numInitialSyncConnectAttempts' times (at + * an interval of '_opts.syncSourceRetryWait' ms) to obtain a valid sync source before giving up + * and returning ErrorCodes::InitialSyncOplogSourceMissing. 
+
+    /**
+     * Sets up internal state to begin initial sync.
+     */
+    void _setUp_inlock(OperationContext* opCtx, std::uint32_t initialSyncMaxAttempts);
+
+    /**
+     * Tears down internal state before reporting final status to caller.
+     */
+    void _tearDown_inlock(OperationContext* opCtx,
+                          const StatusWith<OpTimeAndWallTime>& lastApplied);
+
+    /**
+     * Callback to start a single initial sync attempt.
+     */
+    void _startInitialSyncAttemptCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs,
+                                          std::uint32_t initialSyncAttempt,
+                                          std::uint32_t initialSyncMaxAttempts) noexcept;
+
+    /**
+     * Callback to obtain sync source from sync source selector.
+     * For every initial sync attempt, we will try up to 'numInitialSyncConnectAttempts' times (at
+     * an interval of '_opts.syncSourceRetryWait' ms) to obtain a valid sync source before giving up
+     * and returning ErrorCodes::InitialSyncOplogSourceMissing.
+     */
+    void _chooseSyncSourceCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs,
+                                   std::uint32_t chooseSyncSourceAttempt,
+                                   std::uint32_t chooseSyncSourceMaxAttempts,
+                                   std::shared_ptr<OnCompletionGuard> onCompletionGuard) noexcept;
+
+    /**
+     * This function does the following:
+     *      1.) Truncate oplog.
+     *      2.) Drop user databases (replicated dbs).
+     */
+    Status _truncateOplogAndDropReplicatedDatabases();
+
+    /**
+     * Callback for rollback checker's first replSetGetRBID command before starting data cloning.
+     */
+    void _rollbackCheckerResetCallback(const RollbackChecker::Result& result,
+                                       std::shared_ptr<OnCompletionGuard> onCompletionGuard);
+
+    /**
+     * Callback for first '_lastOplogEntryFetcher' callback. A successful response lets us
+     * determine the default starting point for tailing the oplog using the OplogFetcher if there
+     * are no active transactions on the sync source. This will be used as the default for the
+     * beginFetchingTimestamp.
+     */
+    void _lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime(
+        const StatusWith<Fetcher::QueryResponse>& result,
+        std::shared_ptr<OnCompletionGuard> onCompletionGuard);
+
+    /**
+     * Schedules a remote command to issue a find command on sync source's transaction table, which
+     * will get us the optime of the oldest active transaction on that node. It will be used as the
+     * beginFetchingTimestamp.
+     */
+    Status _scheduleGetBeginFetchingOpTime_inlock(
+        std::shared_ptr<OnCompletionGuard> onCompletionGuard,
+        const OpTime& defaultBeginFetchingOpTime);
+
+    /**
+     * Callback that gets the optime of the oldest active transaction in the sync source's
+     * transaction table. It will be used as the beginFetchingTimestamp.
+     */
+    void _getBeginFetchingOpTimeCallback(const StatusWith<Fetcher::QueryResponse>& result,
+                                         std::shared_ptr<OnCompletionGuard> onCompletionGuard,
+                                         const OpTime& defaultBeginFetchingOpTime);
+
+    /**
+     * Callback for second '_lastOplogEntryFetcher' callback. A successful response lets us
+     * determine the starting point for applying oplog entries during the oplog application phase
+     * as well as setting a reference point for the state of the sync source's oplog when data
+     * cloning completes.
+     */
+    void _lastOplogEntryFetcherCallbackForBeginApplyingTimestamp(
+        const StatusWith<Fetcher::QueryResponse>& result,
+        std::shared_ptr<OnCompletionGuard> onCompletionGuard,
+        OpTime& beginFetchingOpTime);
+
+    /**
+     * Callback for the '_fCVFetcher'. A successful response lets us check if the remote node
+     * is in a currently acceptable fCV and if it has a 'targetVersion' set.
+     */
+    void _fcvFetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
+                             std::shared_ptr<OnCompletionGuard> onCompletionGuard,
+                             const OpTime& lastOpTime,
+                             OpTime& beginFetchingOpTime);
+
+    /**
+     * Callback for oplog fetcher.
+     */
+    void _oplogFetcherCallback(const Status& status,
+                               std::shared_ptr<OnCompletionGuard> onCompletionGuard);
+
+    /**
+     * Callback for DatabasesCloner.
+     */
+    void _allDatabaseClonerCallback(const Status& status,
+                                    std::shared_ptr<OnCompletionGuard> onCompletionGuard);
+
+    /**
+     * Callback for third '_lastOplogEntryFetcher' callback. This is scheduled to obtain the stop
+     * timestamp after DatabasesCloner has completed and enables us to determine if the oplog on
+     * the sync source has advanced since we started cloning the databases.
+     */
+    void _lastOplogEntryFetcherCallbackForStopTimestamp(
+        const StatusWith<Fetcher::QueryResponse>& result,
+        std::shared_ptr<OnCompletionGuard> onCompletionGuard);
+
+    /**
+     * Callback to obtain next batch of operations to apply.
+     */
+    void _getNextApplierBatchCallback(
+        const executor::TaskExecutor::CallbackArgs& callbackArgs,
+        std::shared_ptr<OnCompletionGuard> onCompletionGuard) noexcept;
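The batch callback above drains the oplog buffer in bounded chunks: take entries until an op-count or byte cap would be exceeded, then hand the batch to the applier. A hedged sketch of that rule with invented caps (the real limits come from server parameters such as 'replBatchLimitBytes'):

    #include <cstddef>
    #include <deque>
    #include <iostream>
    #include <string>
    #include <vector>

    // Illustrative stand-in for an oplog entry with a byte size.
    struct Entry {
        std::string op;
        std::size_t bytes;
    };

    // Pull entries until either cap would be exceeded; always take at least
    // one entry so a single oversized entry cannot wedge the applier.
    std::vector<Entry> nextBatch(std::deque<Entry>& buffer,
                                 std::size_t maxOps,
                                 std::size_t maxBytes) {
        std::vector<Entry> batch;
        std::size_t bytes = 0;
        while (!buffer.empty() && batch.size() < maxOps) {
            const Entry& e = buffer.front();
            if (!batch.empty() && bytes + e.bytes > maxBytes)
                break;
            bytes += e.bytes;
            batch.push_back(e);
            buffer.pop_front();
        }
        return batch;
    }

    int main() {
        std::deque<Entry> buffer{{"i1", 40}, {"i2", 40}, {"i3", 40}};
        auto batch = nextBatch(buffer, /*maxOps=*/10, /*maxBytes=*/100);
        std::cout << "batch of " << batch.size() << ", " << buffer.size()
                  << " left in buffer\n";  // batch of 2, 1 left in buffer
        return 0;
    }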
+
+    /**
+     * Callback for MultiApplier completion.
+     */
+    void _multiApplierCallback(const Status& status,
+                               OpTimeAndWallTime lastApplied,
+                               std::uint32_t numApplied,
+                               std::shared_ptr<OnCompletionGuard> onCompletionGuard);
+
+    /**
+     * Callback for rollback checker's last replSetGetRBID command after cloning data and applying
+     * operations.
+     */
+    void _rollbackCheckerCheckForRollbackCallback(
+        const RollbackChecker::Result& result,
+        std::shared_ptr<OnCompletionGuard> onCompletionGuard);
+
+    /**
+     * Reports result of current initial sync attempt. May schedule another initial sync attempt
+     * depending on shutdown state and whether we've exhausted all initial sync retries.
+     */
+    void _finishInitialSyncAttempt(const StatusWith<OpTimeAndWallTime>& lastApplied);
+
+    /**
+     * Invokes completion callback and transitions state to State::kComplete.
+     */
+    void _finishCallback(StatusWith<OpTimeAndWallTime> lastApplied);
+
+    // Obtains a valid sync source from the sync source selector.
+    // Returns error if a sync source cannot be found.
+    StatusWith<HostAndPort> _chooseSyncSource_inlock();
+
+    /**
+     * Pushes documents from oplog fetcher to blocking queue for
+     * applier to consume.
+     *
+     * Returns a status even though it always returns OK, to conform to the interface OplogFetcher
+     * expects for the EnqueueDocumentsFn.
+     */
+    Status _enqueueDocuments(OplogFetcher::Documents::const_iterator begin,
+                             OplogFetcher::Documents::const_iterator end,
+                             const OplogFetcher::DocumentsInfo& info);
+
+    void _appendInitialSyncProgressMinimal_inlock(BSONObjBuilder* bob) const;
+    BSONObj _getInitialSyncProgress_inlock() const;
+
+    StatusWith<std::vector<OplogEntry>> _getNextApplierBatch_inlock();
+
+    /**
+     * Schedules a fetcher to get the last oplog entry from the sync source.
+     *
+     * If 'retryStrategy' is 'kFetcherHandlesRetries', the fetcher will retry up to the server
+     * parameter 'numInitialSyncOplogFindAttempts' times. Otherwise any failures must be handled by
+     * the caller.
+     */
+    Status _scheduleLastOplogEntryFetcher_inlock(Fetcher::CallbackFn callback,
+                                                 LastOplogEntryFetcherRetryStrategy retryStrategy);
+
+    /**
+     * Checks the current oplog application progress (begin and stop timestamps).
+     * If necessary, schedules a _getNextApplierBatchCallback() task.
+     * If the begin and stop timestamps are inconsistent or if there is an issue scheduling the
+     * task, we set the error status in 'onCompletionGuard' and shut down the OplogFetcher.
+     * Passes 'lock' through to completion guard.
+     */
+    void _checkApplierProgressAndScheduleGetNextApplierBatch_inlock(
+        const stdx::lock_guard<Latch>& lock, std::shared_ptr<OnCompletionGuard> onCompletionGuard);
+
+    /**
+     * Schedules a rollback checker to get the rollback ID after data cloning or applying. This
+     * helps us check if a rollback occurred on the sync source.
+     * If we fail to schedule the rollback checker, we set the error status in 'onCompletionGuard'
+     * and shut down the OplogFetcher.
+     * Passes 'lock' through to completion guard.
+     */
+    void _scheduleRollbackCheckerCheckForRollback_inlock(
+        const stdx::lock_guard<Latch>& lock, std::shared_ptr<OnCompletionGuard> onCompletionGuard);
+
+    /**
+     * Check if a status is one which means there's a retriable error and we should retry the
+     * current operation, and records whether an operation is currently being retried. Note this
+     * can only handle one operation at a time (i.e. it should not be used in both parts of the
+     * "split" section of Initial Sync)
+     */
+    bool _shouldRetryError(WithLock lk, Status status);
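'_shouldRetryError' treats a transient network error as retryable only while the cumulative outage stays inside an allowed window (see '_allowedOutageDuration' among the members below). A compact sketch of that policy under assumed semantics, using a bare steady_clock in place of the InitialSyncSharedData bookkeeping:

    #include <chrono>
    #include <iostream>
    #include <optional>

    using Clock = std::chrono::steady_clock;

    // Illustrative retry policy: retries are allowed while the current outage,
    // measured from its first failure, is shorter than the allowed duration.
    class RetryWindow {
    public:
        explicit RetryWindow(Clock::duration allowedOutage) : _allowedOutage(allowedOutage) {}

        bool shouldRetry(Clock::time_point now) {
            if (!_outageStart)  // first failure opens the outage window
                _outageStart = now;
            return now - *_outageStart < _allowedOutage;
        }

        // A success closes the window, like fetchSuccessful()/_clearRetriableError().
        void onSuccess() { _outageStart.reset(); }

    private:
        Clock::duration _allowedOutage;
        std::optional<Clock::time_point> _outageStart;
    };

    int main() {
        using namespace std::chrono_literals;
        RetryWindow window(5s);
        auto t0 = Clock::now();
        std::cout << window.shouldRetry(t0) << '\n';        // 1: inside the window
        std::cout << window.shouldRetry(t0 + 10s) << '\n';  // 0: outage exceeded 5s
        return 0;
    }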
+
+    /**
+     * Indicates we are no longer handling a retriable error.
+     */
+    void _clearRetriableError(WithLock lk);
+
+    /**
+     * Checks the given status (or embedded status inside the callback args) and current data
+     * replicator shutdown state. If the given status is not OK or if we are shutting down, returns
+     * a new error status that should be passed to _finishCallback. The reason in the new error
+     * status will include 'message'.
+     * Otherwise, returns Status::OK().
+     */
+    Status _checkForShutdownAndConvertStatus_inlock(
+        const executor::TaskExecutor::CallbackArgs& callbackArgs, const std::string& message);
+    Status _checkForShutdownAndConvertStatus_inlock(const Status& status,
+                                                    const std::string& message);
+
+    /**
+     * Schedules work to be run by the task executor.
+     * Saves handle if work was successfully scheduled.
+     * Returns scheduleWork status (without the handle).
+     */
+    Status _scheduleWorkAndSaveHandle_inlock(executor::TaskExecutor::CallbackFn work,
+                                             executor::TaskExecutor::CallbackHandle* handle,
+                                             const std::string& name);
+    Status _scheduleWorkAtAndSaveHandle_inlock(Date_t when,
+                                               executor::TaskExecutor::CallbackFn work,
+                                               executor::TaskExecutor::CallbackHandle* handle,
+                                               const std::string& name);
+
+    /**
+     * Cancels task executor callback handle if not null.
+     */
+    void _cancelHandle_inlock(executor::TaskExecutor::CallbackHandle handle);
+
+    /**
+     * Starts up component and checks initial syncer's shutdown state at the same time.
+     * If component's startup() fails, resets 'component' (which is assumed to be a unique_ptr
+     * to the component type).
+     */
+    template <typename Component>
+    Status _startupComponent_inlock(Component& component);
+
+    /**
+     * Shuts down component if not null.
+     */
+    template <typename Component>
+    void _shutdownComponent_inlock(Component& component);
+
+    // Counts how many documents have been refetched from the source in the current batch.
+    AtomicWord<std::uint32_t> _fetchCount;
+
+    //
+    // All member variables are labeled with one of the following codes indicating the
+    // synchronization rules for accessing them.
+    //
+    // (R)  Read-only in concurrent operation; no synchronization required.
+    // (S)  Self-synchronizing; access in any way from any context.
+    // (M)  Reads and writes guarded by _mutex
+    // (X)  Reads and writes must be performed in a callback in _exec
+    // (MX) Must hold _mutex and be in a callback in _exec to write; must either hold
+    //      _mutex or be in a callback in _exec to read.
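These single-letter codes are a documentation convention, not something the compiler enforces; an (M) member is correct only because every accessor takes '_mutex' first. A small illustration of the discipline the annotation implies (plain std::mutex here; the real class uses MONGO_MAKE_LATCH and WithLock helpers):

    #include <iostream>
    #include <mutex>
    #include <string>

    // Illustrative only: an (M)-annotated member is touched exclusively under _mutex.
    class Example {
    public:
        void setSyncSource(std::string s) {
            std::lock_guard<std::mutex> lk(_mutex);
            _syncSource = std::move(s);  // (M): write guarded by _mutex
        }

        std::string getSyncSource() const {
            std::lock_guard<std::mutex> lk(_mutex);
            return _syncSource;  // (M): read guarded by _mutex
        }

    private:
        mutable std::mutex _mutex;  // (S): the latch itself is self-synchronizing
        std::string _syncSource;    // (M)
    };

    int main() {
        Example e;
        e.setSyncSource("node1:27017");
        std::cout << e.getSyncSource() << '\n';
        return 0;
    }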
+
+    mutable Mutex _mutex = MONGO_MAKE_LATCH("InitialSyncerFCB::_mutex");  // (S)
+    const InitialSyncerInterface::Options _opts;                          // (R)
+    std::unique_ptr<DataReplicatorExternalState> _dataReplicatorExternalState;  // (R)
+    std::shared_ptr<executor::TaskExecutor> _exec;                        // (R)
+    std::unique_ptr<executor::ScopedTaskExecutor> _attemptExec;           // (X)
+    // The executor that the Cloner thread runs on. In production code this is the same as _exec,
+    // but for unit testing, _exec is single-threaded and our NetworkInterfaceMock runs it in
+    // lockstep with the unit test code. If we pause the cloners using failpoints,
+    // NetworkInterfaceMock is unaware of this and this causes our unit tests to deadlock.
+    std::shared_ptr<executor::TaskExecutor> _clonerExec;                  // (R)
+    std::unique_ptr<executor::ScopedTaskExecutor> _clonerAttemptExec;     // (X)
+    ThreadPool* _writerPool;                                              // (R)
+    StorageInterface* _storage;                                           // (R)
+    ReplicationProcess* _replicationProcess;                              // (S)
+
+    // This is invoked with the final status of the initial sync. If startup() fails, this callback
+    // is never invoked. The caller gets the last applied optime when the initial sync completes
+    // successfully or an error status.
+    // '_onCompletion' is cleared on completion (in _finishCallback()) in order to release any
+    // resources that might be held by the callback function object.
+    OnCompletionFn _onCompletion;  // (M)
+
+    // Handle to currently scheduled _startInitialSyncAttemptCallback() task.
+    executor::TaskExecutor::CallbackHandle _startInitialSyncAttemptHandle;  // (M)
+
+    // Handle to currently scheduled _chooseSyncSourceCallback() task.
+    executor::TaskExecutor::CallbackHandle _chooseSyncSourceHandle;  // (M)
+
+    // RollbackChecker to get rollback ID before and after each initial sync attempt.
+    std::unique_ptr<RollbackChecker> _rollbackChecker;  // (M)
+
+    // Handle returned from RollbackChecker::reset().
+    RollbackChecker::CallbackHandle _getBaseRollbackIdHandle;  // (M)
+
+    // Handle returned from RollbackChecker::checkForRollback().
+    RollbackChecker::CallbackHandle _getLastRollbackIdHandle;  // (M)
+
+    // Handle to currently scheduled _getNextApplierBatchCallback() task.
+    executor::TaskExecutor::CallbackHandle _getNextApplierBatchHandle;  // (M)
+
+    // The operation, if any, currently being retried because of a network error.
+    InitialSyncSharedData::RetryableOperation _retryingOperation;  // (M)
+
+    std::unique_ptr<InitialSyncState> _initialSyncState;   // (M)
+    std::unique_ptr<OplogFetcher> _oplogFetcher;           // (S)
+    std::unique_ptr<Fetcher> _beginFetchingOpTimeFetcher;  // (S)
+    std::unique_ptr<Fetcher> _lastOplogEntryFetcher;       // (S)
+    std::unique_ptr<Fetcher> _fCVFetcher;                  // (S)
+    std::unique_ptr<MultiApplier> _applier;                // (M)
+    HostAndPort _syncSource;                               // (M)
+    std::unique_ptr<DBClientConnection> _client;           // (M)
+    OpTime _lastFetched;                                   // (MX)
+    OpTimeAndWallTime _lastApplied;                        // (MX)
+
+    std::unique_ptr<OplogBuffer> _oplogBuffer;    // (M)
+    std::unique_ptr<OplogApplier> _oplogApplier;  // (M)
+
+    // Used to signal changes in _state.
+    mutable stdx::condition_variable _stateCondition;
+
+    // Current initial syncer state. See comments for State enum class for details.
+    State _state = State::kPreStart;  // (M)
+
+    // Used to create the DBClientConnection for the cloners.
+    CreateClientFn _createClientFn;
+
+    // Used to create the OplogFetcher for the InitialSyncerFCB.
+    std::unique_ptr<CreateOplogFetcherFn> _createOplogFetcherFn;
+
+    // Contains stats on the current initial sync request (includes all attempts).
+    // To access these stats in a user-readable format, use getInitialSyncProgress().
+    Stats _stats;  // (M)
+
+    // Data shared by cloners and fetcher. Follow InitialSyncSharedData synchronization rules.
+    std::unique_ptr<InitialSyncSharedData> _sharedData;  // (S)
+
+    // Amount of time an outage is allowed to continue before the initial sync attempt is marked
+    // as failed.
+    Milliseconds _allowedOutageDuration;  // (M)
+
+    // The initial sync attempt has been canceled.
+    bool _attemptCanceled = false;  // (X)
+};
+
+}  // namespace repl
+}  // namespace mongo

From 4d4cd942c52c679f1dfca751f34066b4ebdfff85 Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Mon, 22 Apr 2024 23:14:48 +0100
Subject: [PATCH 04/32] PSMDB-1284 do not set initial sync flag

---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index c4c30b2527927..44f6e40157cf3 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -537,7 +537,6 @@ void InitialSyncerFCB::waitForCloner_forTest() {
 void InitialSyncerFCB::_setUp_inlock(OperationContext* opCtx,
                                      std::uint32_t initialSyncMaxAttempts) {
     // 'opCtx' is passed through from startup().
- _replicationProcess->getConsistencyMarkers()->setInitialSyncFlag(opCtx); _replicationProcess->getConsistencyMarkers()->clearInitialSyncId(opCtx); auto* serviceCtx = opCtx->getServiceContext(); @@ -583,12 +582,8 @@ void InitialSyncerFCB::_tearDown_inlock(OperationContext* opCtx, _replicationProcess->getConsistencyMarkers()->setInitialSyncIdIfNotSet(opCtx); - // We set the initial data timestamp before clearing the initial sync flag. See comments in - // clearInitialSyncFlag. _storage->setInitialDataTimestamp(opCtx->getServiceContext(), initialDataTimestamp); - _replicationProcess->getConsistencyMarkers()->clearInitialSyncFlag(opCtx); - auto currentLastAppliedOpTime = _opts.getMyLastOptime(); if (currentLastAppliedOpTime.isNull()) { _opts.setMyLastOptime(lastApplied.getValue()); From 6289219cce237886e80c2c0695ef697cd77db022 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Sun, 12 May 2024 23:36:01 +0100 Subject: [PATCH 05/32] PSMDB-1284 remove most parts related to logical sync --- src/mongo/db/repl/initial_syncer_fcb.cpp | 1030 +--------------------- src/mongo/db/repl/initial_syncer_fcb.h | 219 +---- 2 files changed, 12 insertions(+), 1237 deletions(-) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index 44f6e40157cf3..fe3aeace5751e 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -31,6 +31,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "initial_syncer_fcb.h" +#include #include #include #include @@ -43,7 +44,6 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/bson/bsonobjbuilder.h" #include "mongo/bson/timestamp.h" #include "mongo/client/fetcher.h" -#include "mongo/client/remote_command_retry_scheduler.h" #include "mongo/db/client.h" #include "mongo/db/feature_compatibility_version_parser.h" #include "mongo/db/index_builds_coordinator.h" @@ -53,8 +53,6 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/repl/initial_syncer_common_stats.h" #include "mongo/db/repl/initial_syncer_factory.h" #include "mongo/db/repl/initial_syncer_interface.h" -#include "mongo/db/repl/oplog_buffer.h" -#include "mongo/db/repl/oplog_fetcher.h" #include "mongo/db/repl/optime.h" #include "mongo/db/repl/repl_server_parameters_gen.h" #include "mongo/db/repl/replication_consistency_markers.h" @@ -64,7 +62,6 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/repl/tenant_migration_access_blocker_util.h" #include "mongo/db/repl/transaction_oplog_application.h" #include "mongo/db/serverless/serverless_operation_lock_registry.h" -#include "mongo/db/session/session_txn_record_gen.h" #include "mongo/executor/task_executor.h" #include "mongo/logv2/log.h" #include "mongo/platform/compiler.h" // IWYU pragma: keep @@ -87,13 +84,6 @@ namespace repl { // Failpoint for initial sync MONGO_FAIL_POINT_DEFINE(failInitialSyncWithBadHostFCB); -// Failpoint which fails initial sync and leaves an oplog entry in the buffer. -MONGO_FAIL_POINT_DEFINE(failInitSyncWithBufferedEntriesLeftFCB); - -// Failpoint which causes the initial sync function to hang after getting the oldest active -// transaction timestamp from the sync source. 
-MONGO_FAIL_POINT_DEFINE(initialSyncHangAfterGettingBeginFetchingTimestampFCB); - // Failpoint which causes the initial sync function to hang before creating shared data and // splitting control flow between the oplog fetcher and the cloners. MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeSplittingControlFlowFCB); @@ -107,37 +97,18 @@ MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeFinishFCB); // Failpoint which causes the initial sync function to hang before creating the oplog. MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeCreatingOplogFCB); -// Failpoint which stops the applier. -MONGO_FAIL_POINT_DEFINE(rsSyncApplyStopFCB); - -// Failpoint which causes the initial sync function to hang after cloning all databases. -MONGO_FAIL_POINT_DEFINE(initialSyncHangAfterDataCloningFCB); - // Failpoint which skips clearing _initialSyncState after a successful initial sync attempt. MONGO_FAIL_POINT_DEFINE(skipClearInitialSyncStateFCB); // Failpoint which causes the initial sync function to fail and hang before starting a new attempt. MONGO_FAIL_POINT_DEFINE(failAndHangInitialSyncFCB); -// Failpoint which fails initial sync before it applies the next batch of oplog entries. -MONGO_FAIL_POINT_DEFINE(failInitialSyncBeforeApplyingBatchFCB); - -// Failpoint which fasserts if applying a batch fails. -MONGO_FAIL_POINT_DEFINE(initialSyncFassertIfApplyingBatchFailsFCB); - -// Failpoint which causes the initial sync function to hang before stopping the oplog fetcher. -MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeCompletingOplogFetchingFCB); - // Failpoint which causes the initial sync function to hang before choosing a sync source. MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeChoosingSyncSourceFCB); // Failpoint which causes the initial sync function to hang after finishing. MONGO_FAIL_POINT_DEFINE(initialSyncHangAfterFinishFCB); -// Failpoints for synchronization, shared with cloners. -extern FailPoint initialSyncFuzzerSynchronizationPoint1; -extern FailPoint initialSyncFuzzerSynchronizationPoint2; - namespace { using namespace executor; using CallbackArgs = executor::TaskExecutor::CallbackArgs; @@ -154,36 +125,6 @@ ServiceContext::UniqueOperationContext makeOpCtx() { return cc().makeOperationContext(); } -StatusWith parseOpTimeAndWallTime(const QueryResponseStatus& fetchResult) { - if (!fetchResult.isOK()) { - return fetchResult.getStatus(); - } - const auto docs = fetchResult.getValue().documents; - const auto hasDoc = docs.begin() != docs.end(); - if (!hasDoc) { - return StatusWith{ErrorCodes::NoMatchingDocument, - "no oplog entry found"}; - } - - return OpTimeAndWallTime::parseOpTimeAndWallTimeFromOplogEntry(docs.front()); -} - -void pauseAtInitialSyncFuzzerSyncronizationPoints(std::string msg) { - // Set and unset by the InitialSyncTest fixture to cause initial sync to pause so that the - // Initial Sync Fuzzer can run commands on the sync source. 
- if (MONGO_unlikely(initialSyncFuzzerSynchronizationPoint1.shouldFail())) { - LOGV2(21158, - "initialSyncFuzzerSynchronizationPoint1 fail point enabled", - "failpointMessage"_attr = msg); - initialSyncFuzzerSynchronizationPoint1.pauseWhileSet(); - } - - if (MONGO_unlikely(initialSyncFuzzerSynchronizationPoint2.shouldFail())) { - LOGV2(21160, "initialSyncFuzzerSynchronizationPoint2 fail point enabled"); - initialSyncFuzzerSynchronizationPoint2.pauseWhileSet(); - } -} - } // namespace const ServiceContext::ConstructorActionRegisterer initialSyncerRegistererFCB( @@ -224,8 +165,7 @@ InitialSyncerFCB::InitialSyncerFCB( _replicationProcess(replicationProcess), _onCompletion(onCompletion), _createClientFn( - [] { return std::make_unique(true /* autoReconnect */); }), - _createOplogFetcherFn(CreateOplogFetcherFn::get()) { + [] { return std::make_unique(true /* autoReconnect */); }) { uassert(ErrorCodes::BadValue, "task executor cannot be null", _exec); uassert(ErrorCodes::BadValue, "invalid storage interface", _storage); uassert(ErrorCodes::BadValue, "invalid replication process", _replicationProcess); @@ -337,11 +277,8 @@ void InitialSyncerFCB::cancelCurrentAttempt() { void InitialSyncerFCB::_cancelRemainingWork_inlock() { _cancelHandle_inlock(_startInitialSyncAttemptHandle); _cancelHandle_inlock(_chooseSyncSourceHandle); - _cancelHandle_inlock(_getBaseRollbackIdHandle); _cancelHandle_inlock(_getLastRollbackIdHandle); - _cancelHandle_inlock(_getNextApplierBatchHandle); - _shutdownComponent_inlock(_oplogFetcher); if (_sharedData) { // We actually hold the required lock, but the lock object itself is not passed through. _clearRetriableError(WithLock::withoutLock()); @@ -354,7 +291,6 @@ void InitialSyncerFCB::_cancelRemainingWork_inlock() { } _shutdownComponent_inlock(_applier); _shutdownComponent_inlock(_fCVFetcher); - _shutdownComponent_inlock(_lastOplogEntryFetcher); _shutdownComponent_inlock(_beginFetchingOpTimeFetcher); (*_attemptExec)->shutdown(); (*_clonerAttemptExec)->shutdown(); @@ -397,8 +333,7 @@ bool InitialSyncerFCB::_isShuttingDown_inlock() const { std::string InitialSyncerFCB::getDiagnosticString() const { LockGuard lk(_mutex); str::stream out; - out << "InitialSyncerFCB -" << " oplogFetcher: " << _oplogFetcher->toString() - << " opsBuffered: " << _oplogBuffer->getSize() << " active: " << _isActive_inlock() + out << "InitialSyncerFCB -" << " active: " << _isActive_inlock() << " shutting down: " << _isShuttingDown_inlock(); if (_initialSyncState) { out << " opsAppied: " << _initialSyncState->appliedOps; @@ -504,27 +439,6 @@ void InitialSyncerFCB::setCreateClientFn_forTest(const CreateClientFn& createCli _createClientFn = createClientFn; } -void InitialSyncerFCB::setCreateOplogFetcherFn_forTest( - std::unique_ptr createOplogFetcherFn) { - LockGuard lk(_mutex); - _createOplogFetcherFn = std::move(createOplogFetcherFn); -} - -OplogFetcher* InitialSyncerFCB::getOplogFetcher_forTest() const { - // Wait up to 10 seconds. 
- for (auto i = 0; i < 100; i++) { - { - LockGuard lk(_mutex); - if (_oplogFetcher) { - return _oplogFetcher.get(); - } - } - sleepmillis(100); - } - invariant(false, "Timed out getting OplogFetcher pointer for test"); - return nullptr; -} - void InitialSyncerFCB::setClonerExecutor_forTest( std::shared_ptr clonerExec) { _clonerExec = std::move(clonerExec); @@ -543,10 +457,6 @@ void InitialSyncerFCB::_setUp_inlock(OperationContext* opCtx, _storage->setInitialDataTimestamp(serviceCtx, Timestamp::kAllowUnstableCheckpointsSentinel); _storage->setStableTimestamp(serviceCtx, Timestamp::min()); - LOGV2_DEBUG(21162, 1, "Creating oplogBuffer"); - _oplogBuffer = _dataReplicatorExternalState->makeInitialSyncOplogBuffer(opCtx); - _oplogBuffer->startup(opCtx); - _stats.initialSyncStart = _exec->now(); _stats.maxFailedInitialSyncAttempts = initialSyncMaxAttempts; _stats.failedInitialSyncAttempts = 0; @@ -559,10 +469,6 @@ void InitialSyncerFCB::_tearDown_inlock(OperationContext* opCtx, const StatusWith& lastApplied) { _stats.initialSyncEnd = _exec->now(); - // This might not be necessary if we failed initial sync. - invariant(_oplogBuffer); - _oplogBuffer->shutdown(opCtx); - if (!lastApplied.isOK()) { return; } @@ -637,8 +543,6 @@ void InitialSyncerFCB::_startInitialSyncAttemptCallback( // has to run outside lock. stdx::lock_guard lock(_mutex); - _oplogApplier = {}; - LOGV2_DEBUG( 21165, 2, "Resetting sync source so a new one can be chosen for this initial sync attempt"); _syncSource = HostAndPort(); @@ -666,9 +570,6 @@ void InitialSyncerFCB::_startInitialSyncAttemptCallback( "server configuration collection (admin.system.version)"); serverGlobalParams.mutableFCV.reset(); - // Clear the oplog buffer. - _oplogBuffer->clear(makeOpCtx().get()); - // Get sync source. std::uint32_t chooseSyncSourceAttempt = 0; std::uint32_t chooseSyncSourceMaxAttempts = @@ -779,17 +680,6 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( _syncSource = syncSource.getValue(); - // Schedule rollback ID checker. - _rollbackChecker = std::make_unique(*_attemptExec, _syncSource); - auto scheduleResult = _rollbackChecker->reset([=](const RollbackChecker::Result& result) { - return _rollbackCheckerResetCallback(result, onCompletionGuard); - }); - status = scheduleResult.getStatus(); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - _getBaseRollbackIdHandle = scheduleResult.getValue(); } catch (const DBException&) { // Report exception as an initial syncer failure. stdx::unique_lock lock(_mutex); @@ -848,246 +738,6 @@ Status InitialSyncerFCB::_truncateOplogAndDropReplicatedDatabases() { return _storage->dropReplicatedDatabases(opCtx.get()); } -void InitialSyncerFCB::_rollbackCheckerResetCallback( - const RollbackChecker::Result& result, std::shared_ptr onCompletionGuard) { - stdx::lock_guard lock(_mutex); - auto status = _checkForShutdownAndConvertStatus_inlock(result.getStatus(), - "error while getting base rollback ID"); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - // Since the beginFetchingOpTime is retrieved before significant work is done copying - // data from the sync source, we allow the OplogEntryFetcher to use its default retry strategy - // which retries up to 'numInitialSyncOplogFindAttempts' times'. This will fail relatively - // quickly in the presence of network errors, allowing us to choose a different sync source. 
- status = _scheduleLastOplogEntryFetcher_inlock( - [=](const StatusWith& response, - mongo::Fetcher::NextAction*, - mongo::BSONObjBuilder*) mutable { - _lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime(response, - onCompletionGuard); - }, - kFetcherHandlesRetries); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } -} - -void InitialSyncerFCB::_lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime( - const StatusWith& result, - std::shared_ptr onCompletionGuard) { - - stdx::unique_lock lock(_mutex); - auto status = _checkForShutdownAndConvertStatus_inlock( - result.getStatus(), "error while getting last oplog entry for begin timestamp"); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - const auto opTimeResult = parseOpTimeAndWallTime(result); - status = opTimeResult.getStatus(); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - // This is the top of the oplog before we query for the oldest active transaction timestamp. If - // that query returns that there are no active transactions, we will use this as the - // beginFetchingTimestamp. - const auto& defaultBeginFetchingOpTime = opTimeResult.getValue().opTime; - - std::string logMsg = str::stream() << "Initial Syncer got the defaultBeginFetchingTimestamp: " - << defaultBeginFetchingOpTime.toString(); - pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg); - LOGV2_DEBUG(6608900, - 1, - "Initial Syncer got the defaultBeginFetchingOpTime", - "defaultBeginFetchingOpTime"_attr = defaultBeginFetchingOpTime); - - status = _scheduleGetBeginFetchingOpTime_inlock(onCompletionGuard, defaultBeginFetchingOpTime); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } -} - -Status InitialSyncerFCB::_scheduleGetBeginFetchingOpTime_inlock( - std::shared_ptr onCompletionGuard, - const OpTime& defaultBeginFetchingOpTime) { - - const auto preparedState = DurableTxnState_serializer(DurableTxnStateEnum::kPrepared); - const auto inProgressState = DurableTxnState_serializer(DurableTxnStateEnum::kInProgress); - - // Obtain the oldest active transaction timestamp from the remote by querying their transactions - // table. To prevent oplog holes (primary) or a stale lastAppliedSnapshot (secondary) from - // causing this query to return an inaccurate timestamp, we specify an afterClusterTime of the - // defaultBeginFetchingOpTime so that we wait for all previous writes to be visible. 
- BSONObjBuilder cmd; - cmd.append("find", NamespaceString::kSessionTransactionsTableNamespace.coll().toString()); - cmd.append("filter", - BSON("state" << BSON("$in" << BSON_ARRAY(preparedState << inProgressState)))); - cmd.append("sort", BSON(SessionTxnRecord::kStartOpTimeFieldName << 1)); - cmd.append("readConcern", - BSON("level" - << "local" - << "afterClusterTime" << defaultBeginFetchingOpTime.getTimestamp())); - cmd.append("limit", 1); - - _beginFetchingOpTimeFetcher = std::make_unique( - *_attemptExec, - _syncSource, - NamespaceString::kSessionTransactionsTableNamespace.db().toString(), - cmd.obj(), - [=](const StatusWith& response, - mongo::Fetcher::NextAction*, - mongo::BSONObjBuilder*) mutable { - _getBeginFetchingOpTimeCallback( - response, onCompletionGuard, defaultBeginFetchingOpTime); - }, - ReadPreferenceSetting::secondaryPreferredMetadata(), - RemoteCommandRequest::kNoTimeout /* find network timeout */, - RemoteCommandRequest::kNoTimeout /* getMore network timeout */, - RemoteCommandRetryScheduler::makeRetryPolicy( - numInitialSyncOplogFindAttempts.load(), executor::RemoteCommandRequest::kNoTimeout)); - Status scheduleStatus = _beginFetchingOpTimeFetcher->schedule(); - if (!scheduleStatus.isOK()) { - _beginFetchingOpTimeFetcher.reset(); - } - return scheduleStatus; -} - -void InitialSyncerFCB::_getBeginFetchingOpTimeCallback( - const StatusWith& result, - std::shared_ptr onCompletionGuard, - const OpTime& defaultBeginFetchingOpTime) { - stdx::unique_lock lock(_mutex); - auto status = _checkForShutdownAndConvertStatus_inlock( - result.getStatus(), - "error while getting oldest active transaction timestamp for begin fetching timestamp"); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - const auto docs = result.getValue().documents; - if (docs.size() > 1) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock( - lock, - Status(ErrorCodes::TooManyMatchingDocuments, - str::stream() << "Expected to receive one document for the oldest active " - "transaction entry, but received: " - << docs.size() << ". First: " << redact(docs.front()) - << ". Last: " << redact(docs.back()))); - return; - } - - // Set beginFetchingOpTime if the oldest active transaction timestamp actually exists. Otherwise - // use the sync source's top of the oplog from before querying for the oldest active transaction - // timestamp. This will mean that even if a transaction is started on the sync source after - // querying for the oldest active transaction timestamp, the node will still fetch its oplog - // entries. 
- OpTime beginFetchingOpTime = defaultBeginFetchingOpTime; - if (!docs.empty()) { - auto entry = SessionTxnRecord::parse( - IDLParserContext("oldest active transaction optime for initial sync"), docs.front()); - auto optime = entry.getStartOpTime(); - if (optime) { - beginFetchingOpTime = optime.value(); - } - } - - std::string logMsg = str::stream() - << "Initial Syncer got the beginFetchingTimestamp: " << beginFetchingOpTime.toString(); - pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg); - - if (MONGO_unlikely(initialSyncHangAfterGettingBeginFetchingTimestampFCB.shouldFail())) { - LOGV2(21176, "initialSyncHangAfterGettingBeginFetchingTimestampFCB fail point enabled"); - initialSyncHangAfterGettingBeginFetchingTimestampFCB.pauseWhileSet(); - } - - // Since the beginFetchingOpTime is retrieved before significant work is done copying - // data from the sync source, we allow the OplogEntryFetcher to use its default retry strategy - // which retries up to 'numInitialSyncOplogFindAttempts' times'. This will fail relatively - // quickly in the presence of network errors, allowing us to choose a different sync source. - status = _scheduleLastOplogEntryFetcher_inlock( - [=](const StatusWith& response, - mongo::Fetcher::NextAction*, - mongo::BSONObjBuilder*) mutable { - _lastOplogEntryFetcherCallbackForBeginApplyingTimestamp( - response, onCompletionGuard, beginFetchingOpTime); - }, - kFetcherHandlesRetries); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } -} - -void InitialSyncerFCB::_lastOplogEntryFetcherCallbackForBeginApplyingTimestamp( - const StatusWith& result, - std::shared_ptr onCompletionGuard, - OpTime& beginFetchingOpTime) { - stdx::unique_lock lock(_mutex); - auto status = _checkForShutdownAndConvertStatus_inlock( - result.getStatus(), "error while getting last oplog entry for begin timestamp"); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - const auto opTimeResult = parseOpTimeAndWallTime(result); - status = opTimeResult.getStatus(); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - const auto& lastOpTime = opTimeResult.getValue().opTime; - - std::string logMsg = str::stream() - << "Initial Syncer got the beginApplyingTimestamp: " << lastOpTime.toString(); - pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg); - - BSONObjBuilder queryBob; - queryBob.append("find", NamespaceString::kServerConfigurationNamespace.coll()); - auto filterBob = BSONObjBuilder(queryBob.subobjStart("filter")); - filterBob.append("_id", multiversion::kParameterName); - filterBob.done(); - // As part of reading the FCV, we ensure the source node's all_durable timestamp has advanced - // to at least the timestamp of the last optime that we found in the lastOplogEntryFetcher. - // When document locking is used, there could be oplog "holes" which would result in - // inconsistent initial sync data if we didn't do this. 
- auto readConcernBob = BSONObjBuilder(queryBob.subobjStart("readConcern")); - readConcernBob.append("afterClusterTime", lastOpTime.getTimestamp()); - readConcernBob.done(); - - _fCVFetcher = std::make_unique( - *_attemptExec, - _syncSource, - NamespaceString::kServerConfigurationNamespace.db().toString(), - queryBob.obj(), - [=](const StatusWith& response, - mongo::Fetcher::NextAction*, - mongo::BSONObjBuilder*) mutable { - _fcvFetcherCallback(response, onCompletionGuard, lastOpTime, beginFetchingOpTime); - }, - ReadPreferenceSetting::secondaryPreferredMetadata(), - RemoteCommandRequest::kNoTimeout /* find network timeout */, - RemoteCommandRequest::kNoTimeout /* getMore network timeout */, - RemoteCommandRetryScheduler::makeRetryPolicy( - numInitialSyncOplogFindAttempts.load(), executor::RemoteCommandRequest::kNoTimeout)); - Status scheduleStatus = _fCVFetcher->schedule(); - if (!scheduleStatus.isOK()) { - _fCVFetcher.reset(); - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, scheduleStatus); - return; - } -} - void InitialSyncerFCB::_fcvFetcherCallback(const StatusWith& result, std::shared_ptr onCompletionGuard, const OpTime& lastOpTime, @@ -1169,17 +819,6 @@ void InitialSyncerFCB::_fcvFetcherCallback(const StatusWith(std::make_unique( _sharedData.get(), _syncSource, _client.get(), _storage, _writerPool)); - // Create oplog applier. - auto* consistencyMarkers = _replicationProcess->getConsistencyMarkers(); - OplogApplier::Options options(OplogApplication::Mode::kInitialSync); - options.beginApplyingOpTime = lastOpTime; - _oplogApplier = _dataReplicatorExternalState->makeOplogApplier(_oplogBuffer.get(), - &noopOplogApplierObserver, - consistencyMarkers, - _storage, - options, - _writerPool); - _initialSyncState->beginApplyingTimestamp = lastOpTime.getTimestamp(); _initialSyncState->beginFetchingTimestamp = beginFetchingOpTime.getTimestamp(); @@ -1209,42 +848,6 @@ void InitialSyncerFCB::_fcvFetcherCallback(const StatusWithgetBaseRBID(), - initialSyncOplogFetcherBatchSize, - OplogFetcher::RequireFresherSyncSource::kDontRequireFresherSyncSource); - oplogFetcherConfig.startingPoint = OplogFetcher::StartingPoint::kEnqueueFirstDoc; - _oplogFetcher = (*_createOplogFetcherFn)( - *_attemptExec, - std::make_unique( - _sharedData.get(), _opts.oplogFetcherMaxFetcherRestarts), - _dataReplicatorExternalState.get(), - [=](OplogFetcher::Documents::const_iterator first, - OplogFetcher::Documents::const_iterator last, - const OplogFetcher::DocumentsInfo& info) { - return _enqueueDocuments(first, last, info); - }, - [=](const Status& s, int rbid) { _oplogFetcherCallback(s, onCompletionGuard); }, - std::move(oplogFetcherConfig)); - - LOGV2_DEBUG(21178, - 2, - "Starting OplogFetcher: {oplogFetcher}", - "Starting OplogFetcher", - "oplogFetcher"_attr = _oplogFetcher->toString()); - - // _startupComponent_inlock is shutdown-aware. 
- status = _startupComponent_inlock(_oplogFetcher); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - _initialSyncState->allDatabaseCloner.reset(); - return; - } - if (MONGO_unlikely(initialSyncHangBeforeCopyingDatabasesFCB.shouldFail())) { lock.unlock(); // This could have been done with a scheduleWorkAt but this is used only by JS tests where @@ -1260,447 +863,7 @@ void InitialSyncerFCB::_fcvFetcherCallback(const StatusWithallDatabaseCloner->toString()); - - auto [startClonerFuture, startCloner] = - _initialSyncState->allDatabaseCloner->runOnExecutorEvent(*_clonerAttemptExec); - // runOnExecutorEvent ensures the future is not ready unless an error has occurred. - if (startClonerFuture.isReady()) { - status = startClonerFuture.getNoThrow(); - invariant(!status.isOK()); - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - _initialSyncState->allDatabaseClonerFuture = - std::move(startClonerFuture).onCompletion([this, onCompletionGuard](Status status) mutable { - // The completion guard must run on the main executor, and never inline. In unit tests, - // without the executor call, it would run on the wrong executor. In both production - // and in unit tests, if the cloner finishes very quickly, the callback could run - // in-line and result in self-deadlock. - stdx::unique_lock lock(_mutex); - auto exec_status = (*_attemptExec) - ->scheduleWork([this, status, onCompletionGuard]( - executor::TaskExecutor::CallbackArgs args) { - _allDatabaseClonerCallback(status, onCompletionGuard); - }); - if (!exec_status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, - exec_status.getStatus()); - // In the shutdown case, it is possible the completion guard will be run - // from this thread (since the lambda holding another copy didn't schedule). - // If it does, we will self-deadlock if we're holding the lock, so release it. - lock.unlock(); - } - // In unit tests, this reset ensures the completion guard does not run during the - // destruction of the lambda (which occurs on the wrong executor), except in the - // shutdown case. - onCompletionGuard.reset(); - }); lock.unlock(); - // Start (and therefore finish) the cloners outside the lock. This ensures onCompletion - // is not run with the mutex held, which would result in self-deadlock. - (*_clonerAttemptExec)->signalEvent(startCloner); -} - -void InitialSyncerFCB::_oplogFetcherCallback(const Status& oplogFetcherFinishStatus, - std::shared_ptr onCompletionGuard) { - stdx::lock_guard lock(_mutex); - LOGV2(21181, - "Finished fetching oplog during initial sync: {oplogFetcherFinishStatus}. Last fetched " - "optime: {lastFetched}", - "Finished fetching oplog during initial sync", - "oplogFetcherFinishStatus"_attr = redact(oplogFetcherFinishStatus), - "lastFetched"_attr = _lastFetched.toString()); - - auto status = _checkForShutdownAndConvertStatus_inlock( - oplogFetcherFinishStatus, "error fetching oplog during initial sync"); - - // When the OplogFetcher completes early (instead of being canceled at shutdown), we log and let - // our reference to 'onCompletionGuard' go out of scope. Since we know the - // DatabasesCloner/MultiApplier will still have a reference to it, the actual function within - // the guard won't be fired yet. - // It is up to the DatabasesCloner and MultiApplier to determine if they can proceed without any - // additional data going into the oplog buffer. 
- // It is not common for the OplogFetcher to return with an OK status. The only time it returns - // an OK status is when the 'stopReplProducer' fail point is enabled, which causes the - // OplogFetcher to ignore the current sync source response and return early. - if (status.isOK()) { - LOGV2(21182, - "Finished fetching oplog fetching early. Last fetched optime: {lastFetched}", - "Finished fetching oplog fetching early", - "lastFetched"_attr = _lastFetched.toString()); - return; - } - - // During normal operation, this call to onCompletion->setResultAndCancelRemainingWork_inlock - // is a no-op because the other thread running the DatabasesCloner or MultiApplier will already - // have called it with the success/failed status. - // The OplogFetcher does not finish on its own because of the oplog tailing query it runs on the - // sync source. The most common OplogFetcher completion status is CallbackCanceled due to either - // a shutdown request or completion of the data cloning and oplog application phases. - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); -} - -void InitialSyncerFCB::_allDatabaseClonerCallback( - const Status& databaseClonerFinishStatus, - std::shared_ptr onCompletionGuard) { - LOGV2(21183, - "Finished cloning data: {databaseClonerFinishStatus}. Beginning oplog replay.", - "Finished cloning data. Beginning oplog replay", - "databaseClonerFinishStatus"_attr = redact(databaseClonerFinishStatus)); - _client->shutdownAndDisallowReconnect(); - - if (MONGO_unlikely(initialSyncHangAfterDataCloningFCB.shouldFail())) { - // This could have been done with a scheduleWorkAt but this is used only by JS tests where - // we run with multiple threads so it's fine to spin on this thread. - // This log output is used in js tests so please leave it. - LOGV2(21184, - "initial sync - initialSyncHangAfterDataCloningFCB fail point " - "enabled. Blocking until fail point is disabled."); - while (MONGO_unlikely(initialSyncHangAfterDataCloningFCB.shouldFail()) && - !_isShuttingDown()) { - mongo::sleepsecs(1); - } - } - - stdx::lock_guard lock(_mutex); - _client.reset(); - auto status = _checkForShutdownAndConvertStatus_inlock(databaseClonerFinishStatus, - "error cloning databases"); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - // Since the stopTimestamp is retrieved after we have done all the work of retrieving collection - // data, we handle retries within this class by retrying for - // 'initialSyncTransientErrorRetryPeriodSeconds' (default 24 hours). This is the same retry - // strategy used when retrieving collection data, and avoids retrieving all the data and then - // throwing it away due to a transient network outage. 
- status = _scheduleLastOplogEntryFetcher_inlock( - [=](const StatusWith& status, - mongo::Fetcher::NextAction*, - mongo::BSONObjBuilder*) { - _lastOplogEntryFetcherCallbackForStopTimestamp(status, onCompletionGuard); - }, - kInitialSyncerHandlesRetries); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } -} - -void InitialSyncerFCB::_lastOplogEntryFetcherCallbackForStopTimestamp( - const StatusWith& result, - std::shared_ptr onCompletionGuard) { - OpTimeAndWallTime resultOpTimeAndWallTime = {OpTime(), Date_t()}; - { - { - stdx::lock_guard lock(_mutex); - auto status = _checkForShutdownAndConvertStatus_inlock( - result.getStatus(), "error fetching last oplog entry for stop timestamp"); - if (_shouldRetryError(lock, status)) { - auto scheduleStatus = - (*_attemptExec) - ->scheduleWork( - [this, onCompletionGuard](executor::TaskExecutor::CallbackArgs args) { - // It is not valid to schedule the retry from within this callback, - // hence we schedule a lambda to schedule the retry. - stdx::lock_guard lock(_mutex); - // Since the stopTimestamp is retrieved after we have done all the - // work of retrieving collection data, we handle retries within this - // class by retrying for - // 'initialSyncTransientErrorRetryPeriodSeconds' (default 24 hours). - // This is the same retry strategy used when retrieving collection - // data, and avoids retrieving all the data and then throwing it - // away due to a transient network outage. - auto status = _scheduleLastOplogEntryFetcher_inlock( - [=](const StatusWith& status, - mongo::Fetcher::NextAction*, - mongo::BSONObjBuilder*) { - _lastOplogEntryFetcherCallbackForStopTimestamp( - status, onCompletionGuard); - }, - kInitialSyncerHandlesRetries); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock( - lock, status); - } - }); - if (scheduleStatus.isOK()) - return; - // If scheduling failed, we're shutting down and cannot retry. - // So just continue with the original failed status. - } - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - auto&& optimeStatus = parseOpTimeAndWallTime(result); - if (!optimeStatus.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, - optimeStatus.getStatus()); - return; - } - resultOpTimeAndWallTime = optimeStatus.getValue(); - } - - // Release the _mutex to write to disk. - auto opCtx = makeOpCtx(); - _replicationProcess->getConsistencyMarkers()->setMinValid(opCtx.get(), - resultOpTimeAndWallTime.opTime); - - stdx::lock_guard lock(_mutex); - _initialSyncState->stopTimestamp = resultOpTimeAndWallTime.opTime.getTimestamp(); - - // If the beginFetchingTimestamp is different from the stopTimestamp, it indicates that - // there are oplog entries fetched by the oplog fetcher that need to be written to the oplog - // and/or there are operations that need to be applied. - if (_initialSyncState->beginFetchingTimestamp != _initialSyncState->stopTimestamp) { - invariant(_lastApplied.opTime.isNull()); - _checkApplierProgressAndScheduleGetNextApplierBatch_inlock(lock, onCompletionGuard); - return; - } - } - - // Oplog at sync source has not advanced since we started cloning databases, so we use the last - // oplog entry to seed the oplog before checking the rollback ID. 
- { - const auto& documents = result.getValue().documents; - invariant(!documents.empty()); - const BSONObj oplogSeedDoc = documents.front(); - LOGV2_DEBUG(21185, - 2, - "Inserting oplog seed document: {oplogSeedDocument}", - "Inserting oplog seed document", - "oplogSeedDocument"_attr = oplogSeedDoc); - - auto opCtx = makeOpCtx(); - // StorageInterface::insertDocument() has to be called outside the lock because we may - // override its behavior in tests. See InitialSyncerReturnsCallbackCanceledAndDoesNot- - // ScheduleRollbackCheckerIfShutdownAfterInsertingInsertOplogSeedDocument in - // initial_syncer_test.cpp - // - // Note that the initial seed oplog insertion is not timestamped, this is safe to do as the - // logic for navigating the oplog is reliant on the timestamp value of the oplog document - // itself. Additionally, this also prevents confusion in the storage engine as the last - // insertion can be produced at precisely the stable timestamp, which could lead to invalid - // data consistency due to the stable timestamp signalling that no operations before or at - // that point will be rolled back. So transactions shouldn't happen at precisely that point. - auto status = _storage->insertDocument(opCtx.get(), - NamespaceString::kRsOplogNamespace, - TimestampedBSONObj{oplogSeedDoc}, - resultOpTimeAndWallTime.opTime.getTerm()); - if (!status.isOK()) { - stdx::lock_guard lock(_mutex); - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - const bool orderedCommit = true; - _storage->oplogDiskLocRegister( - opCtx.get(), resultOpTimeAndWallTime.opTime.getTimestamp(), orderedCommit); - } - - stdx::lock_guard lock(_mutex); - _lastApplied = resultOpTimeAndWallTime; - LOGV2(21186, - "No need to apply operations. (currently at {stopTimestamp})", - "No need to apply operations", - "stopTimestamp"_attr = _initialSyncState->stopTimestamp.toBSON()); - - // This sets the error in 'onCompletionGuard' and shuts down the OplogFetcher on error. - _scheduleRollbackCheckerCheckForRollback_inlock(lock, onCompletionGuard); -} - -void InitialSyncerFCB::_getNextApplierBatchCallback( - const executor::TaskExecutor::CallbackArgs& callbackArgs, - std::shared_ptr onCompletionGuard) noexcept try { - stdx::lock_guard lock(_mutex); - auto status = - _checkForShutdownAndConvertStatus_inlock(callbackArgs, "error getting next applier batch"); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - auto batchResult = _getNextApplierBatch_inlock(); - if (!batchResult.isOK()) { - LOGV2_WARNING(21196, - "Failure creating next apply batch: {error}", - "Failure creating next apply batch", - "error"_attr = redact(batchResult.getStatus())); - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, batchResult.getStatus()); - return; - } - - std::string logMsg = str::stream() - << "Initial Syncer is about to apply the next oplog batch of size: " - << batchResult.getValue().size(); - pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg); - - if (MONGO_unlikely(failInitialSyncBeforeApplyingBatchFCB.shouldFail())) { - LOGV2(21187, - "initial sync - failInitialSyncBeforeApplyingBatchFCB fail point enabled. 
Pausing " - "until " - "fail point is disabled, then will fail initial sync"); - failInitialSyncBeforeApplyingBatchFCB.pauseWhileSet(); - status = Status(ErrorCodes::CallbackCanceled, - "failInitialSyncBeforeApplyingBatchFCB fail point enabled"); - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - // Schedule MultiApplier if we have operations to apply. - const auto& ops = batchResult.getValue(); - if (!ops.empty()) { - _fetchCount.store(0); - MultiApplier::MultiApplyFn applyBatchOfOperationsFn = [this](OperationContext* opCtx, - std::vector ops) { - return _oplogApplier->applyOplogBatch(opCtx, std::move(ops)); - }; - OpTime lastApplied = ops.back().getOpTime(); - Date_t lastAppliedWall = ops.back().getWallClockTime(); - - auto numApplied = ops.size(); - MultiApplier::CallbackFn onCompletionFn = [=](const Status& s) { - return _multiApplierCallback( - s, {lastApplied, lastAppliedWall}, numApplied, onCompletionGuard); - }; - - _applier = std::make_unique( - *_attemptExec, ops, std::move(applyBatchOfOperationsFn), std::move(onCompletionFn)); - status = _startupComponent_inlock(_applier); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - return; - } - - // If the oplog fetcher is no longer running (completed successfully) and the oplog buffer is - // empty, we are not going to make any more progress with this initial sync. Report progress so - // far and return a RemoteResultsUnavailable error. - if (!_oplogFetcher->isActive()) { - static constexpr char msg[] = - "The oplog fetcher is no longer running and we have applied all the oplog entries " - "in the oplog buffer. Aborting this initial sync attempt"; - LOGV2(21188, - msg, - "lastApplied"_attr = _lastApplied.opTime, - "lastFetched"_attr = _lastFetched, - "operationsApplied"_attr = _initialSyncState->appliedOps); - status = Status(ErrorCodes::RemoteResultsUnavailable, - str::stream() - << msg << ". Last applied: " << _lastApplied.opTime.toString() - << ". Last fetched: " << _lastFetched.toString() - << ". Number of operations applied: " << _initialSyncState->appliedOps); - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - // If there are no operations at the moment to apply and the oplog fetcher is still waiting on - // the sync source, we'll check the oplog buffer again in - // '_opts.getApplierBatchCallbackRetryWait' ms. - auto when = (*_attemptExec)->now() + _opts.getApplierBatchCallbackRetryWait; - status = _scheduleWorkAtAndSaveHandle_inlock( - when, - [=](const CallbackArgs& args) { _getNextApplierBatchCallback(args, onCompletionGuard); }, - &_getNextApplierBatchHandle, - "_getNextApplierBatchCallback"); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } -} catch (const DBException&) { - // Report exception as an initial syncer failure. 
- stdx::unique_lock lock(_mutex); - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus()); -} - -void InitialSyncerFCB::_multiApplierCallback(const Status& multiApplierStatus, - OpTimeAndWallTime lastApplied, - std::uint32_t numApplied, - std::shared_ptr onCompletionGuard) { - stdx::lock_guard lock(_mutex); - auto status = - _checkForShutdownAndConvertStatus_inlock(multiApplierStatus, "error applying batch"); - - // Set to cause initial sync to fassert instead of restart if applying a batch fails, so that - // tests can be robust to network errors but not oplog idempotency errors. - if (MONGO_unlikely(initialSyncFassertIfApplyingBatchFailsFCB.shouldFail())) { - LOGV2(21189, "initialSyncFassertIfApplyingBatchFailsFCB fail point enabled"); - fassert(31210, status); - } - - if (!status.isOK()) { - LOGV2_ERROR(21199, - "Failed to apply batch due to '{error}'", - "Failed to apply batch", - "error"_attr = redact(status)); - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - _initialSyncState->appliedOps += numApplied; - _lastApplied = lastApplied; - const auto lastAppliedOpTime = _lastApplied.opTime; - _opts.setMyLastOptime(_lastApplied); - - // Update oplog visibility after applying a batch so that while applying transaction oplog - // entries, the TransactionHistoryIterator can get earlier oplog entries associated with the - // transaction. Note that setting the oplog visibility timestamp here will be safe even if - // initial sync was restarted because until initial sync ends, no one else will try to read our - // oplog. It is also safe even if we tried to read from our own oplog because we never try to - // read from the oplog before applying at least one batch and therefore setting a value for the - // oplog visibility timestamp. - auto opCtx = makeOpCtx(); - const bool orderedCommit = true; - _storage->oplogDiskLocRegister(opCtx.get(), lastAppliedOpTime.getTimestamp(), orderedCommit); - _checkApplierProgressAndScheduleGetNextApplierBatch_inlock(lock, onCompletionGuard); -} - -void InitialSyncerFCB::_rollbackCheckerCheckForRollbackCallback( - const RollbackChecker::Result& result, std::shared_ptr onCompletionGuard) { - stdx::lock_guard lock(_mutex); - auto status = _checkForShutdownAndConvertStatus_inlock(result.getStatus(), - "error while getting last rollback ID"); - if (_shouldRetryError(lock, status)) { - LOGV2_DEBUG(21190, - 1, - "Retrying rollback checker because of network error {error}", - "Retrying rollback checker because of network error", - "error"_attr = status); - _scheduleRollbackCheckerCheckForRollback_inlock(lock, onCompletionGuard); - return; - } - - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - auto hasHadRollback = result.getValue(); - if (hasHadRollback) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock( - lock, - Status(ErrorCodes::UnrecoverableRollbackError, - str::stream() << "Rollback occurred on our sync source " << _syncSource - << " during initial sync")); - return; - } - - if (MONGO_unlikely(initialSyncHangBeforeCompletingOplogFetchingFCB.shouldFail())) { - LOGV2(4599500, "initialSyncHangBeforeCompletingOplogFetchingFCB fail point enabled"); - initialSyncHangBeforeCompletingOplogFetchingFCB.pauseWhileSet(); - } - - // Success! 
- onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, _lastApplied); } void InitialSyncerFCB::_finishInitialSyncAttempt(const StatusWith& lastApplied) { @@ -1901,129 +1064,6 @@ void InitialSyncerFCB::_finishCallback(StatusWith lastApplied } } -Status InitialSyncerFCB::_scheduleLastOplogEntryFetcher_inlock( - Fetcher::CallbackFn callback, LastOplogEntryFetcherRetryStrategy retryStrategy) { - BSONObj query = - BSON("find" << NamespaceString::kRsOplogNamespace.coll() << "sort" << BSON("$natural" << -1) - << "limit" << 1 << ReadConcernArgs::kReadConcernFieldName - << ReadConcernArgs::kLocal); - - _lastOplogEntryFetcher = std::make_unique( - *_attemptExec, - _syncSource, - NamespaceString::kRsOplogNamespace.db().toString(), - query, - callback, - ReadPreferenceSetting::secondaryPreferredMetadata(), - RemoteCommandRequest::kNoTimeout /* find network timeout */, - RemoteCommandRequest::kNoTimeout /* getMore network timeout */, - (retryStrategy == kFetcherHandlesRetries) - ? RemoteCommandRetryScheduler::makeRetryPolicy( - numInitialSyncOplogFindAttempts.load(), - executor::RemoteCommandRequest::kNoTimeout) - : RemoteCommandRetryScheduler::makeNoRetryPolicy()); - Status scheduleStatus = _lastOplogEntryFetcher->schedule(); - if (!scheduleStatus.isOK()) { - _lastOplogEntryFetcher.reset(); - } - - return scheduleStatus; -} - -void InitialSyncerFCB::_checkApplierProgressAndScheduleGetNextApplierBatch_inlock( - const stdx::lock_guard& lock, std::shared_ptr onCompletionGuard) { - // We should check our current state because shutdown() could have been called before - // we re-acquired the lock. - if (_isShuttingDown_inlock()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock( - lock, - Status(ErrorCodes::CallbackCanceled, - "failed to schedule applier to check for " - "rollback: initial syncer is shutting down")); - return; - } - - // Basic sanity check on begin/stop timestamps. - if (_initialSyncState->beginApplyingTimestamp > _initialSyncState->stopTimestamp) { - static constexpr char msg[] = "Possible rollback on sync source"; - LOGV2_ERROR(21201, - msg, - "syncSource"_attr = _syncSource, - "stopTimestamp"_attr = _initialSyncState->stopTimestamp.toBSON(), - "beginApplyingTimestamp"_attr = - _initialSyncState->beginApplyingTimestamp.toBSON()); - onCompletionGuard->setResultAndCancelRemainingWork_inlock( - lock, - Status(ErrorCodes::OplogOutOfOrder, - str::stream() << msg << " " << _syncSource.toString() << ". Currently at " - << _initialSyncState->stopTimestamp.toBSON() << ". Started at " - << _initialSyncState->beginApplyingTimestamp.toBSON())); - return; - } - - if (_lastApplied.opTime.isNull()) { - // Check if any ops occurred while cloning or any ops need to be fetched. - invariant(_initialSyncState->beginFetchingTimestamp < _initialSyncState->stopTimestamp); - LOGV2(21195, - "Writing to the oplog and applying operations until {stopTimestamp} " - "before initial sync can complete. (started fetching at " - "{beginFetchingTimestamp} and applying at " - "{beginApplyingTimestamp})", - "Writing to the oplog and applying operations until stopTimestamp before initial " - "sync can complete", - "stopTimestamp"_attr = _initialSyncState->stopTimestamp.toBSON(), - "beginFetchingTimestamp"_attr = _initialSyncState->beginFetchingTimestamp.toBSON(), - "beginApplyingTimestamp"_attr = _initialSyncState->beginApplyingTimestamp.toBSON()); - // Fall through to scheduling _getNextApplierBatchCallback(). 
- } else if (_lastApplied.opTime.getTimestamp() >= _initialSyncState->stopTimestamp) { - // Check for rollback if we have applied far enough to be consistent. - invariant(!_lastApplied.opTime.getTimestamp().isNull()); - _scheduleRollbackCheckerCheckForRollback_inlock(lock, onCompletionGuard); - return; - } - - // Get another batch to apply. - // _scheduleWorkAndSaveHandle_inlock() is shutdown-aware. - auto status = _scheduleWorkAndSaveHandle_inlock( - [=](const executor::TaskExecutor::CallbackArgs& args) { - return _getNextApplierBatchCallback(args, onCompletionGuard); - }, - &_getNextApplierBatchHandle, - "_getNextApplierBatchCallback"); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } -} - -void InitialSyncerFCB::_scheduleRollbackCheckerCheckForRollback_inlock( - const stdx::lock_guard& lock, std::shared_ptr onCompletionGuard) { - // We should check our current state because shutdown() could have been called before - // we re-acquired the lock. - if (_isShuttingDown_inlock()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock( - lock, - Status(ErrorCodes::CallbackCanceled, - "failed to schedule rollback checker to check " - "for rollback: initial syncer is shutting " - "down")); - return; - } - - auto scheduleResult = - _rollbackChecker->checkForRollback([=](const RollbackChecker::Result& result) { - _rollbackCheckerCheckForRollbackCallback(result, onCompletionGuard); - }); - - auto status = scheduleResult.getStatus(); - if (!status.isOK()) { - onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); - return; - } - - _getLastRollbackIdHandle = scheduleResult.getValue(); -} - bool InitialSyncerFCB::_shouldRetryError(WithLock lk, Status status) { if (ErrorCodes::isRetriableError(status)) { stdx::lock_guard sharedDataLock(*_sharedData); @@ -2129,27 +1169,6 @@ void InitialSyncerFCB::_shutdownComponent_inlock(Component& component) { component->shutdown(); } -StatusWith> InitialSyncerFCB::_getNextApplierBatch_inlock() { - // If the fail-point is active, delay the apply batch by returning an empty batch so that - // _getNextApplierBatchCallback() will reschedule itself at a later time. - // See InitialSyncerInterface::Options::getApplierBatchCallbackRetryWait. - if (MONGO_unlikely(rsSyncApplyStopFCB.shouldFail())) { - return std::vector(); - } - - // Obtain next batch of operations from OplogApplier. - auto opCtx = makeOpCtx(); - OplogApplier::BatchLimits batchLimits; - batchLimits.bytes = replBatchLimitBytes.load(); - batchLimits.ops = getBatchLimitOplogEntries(); - // We want a batch boundary after the beginApplyingTimestamp, to make sure all oplog entries - // that are part of a transaction before that timestamp are written out before we start applying - // entries after them. This is because later entries may be commit or prepare and thus - // expect to read the partial entries from the oplog. 
- batchLimits.forceBatchBoundaryAfter = _initialSyncState->beginApplyingTimestamp; - return _oplogApplier->getNextApplierBatch(opCtx.get(), batchLimits); -} - StatusWith InitialSyncerFCB::_chooseSyncSource_inlock() { auto syncSource = _opts.syncSourceSelector->chooseNewSyncSource(_lastFetched); if (syncSource.empty()) { @@ -2160,31 +1179,6 @@ StatusWith InitialSyncerFCB::_chooseSyncSource_inlock() { return syncSource; } -Status InitialSyncerFCB::_enqueueDocuments(OplogFetcher::Documents::const_iterator begin, - OplogFetcher::Documents::const_iterator end, - const OplogFetcher::DocumentsInfo& info) { - if (info.toApplyDocumentCount == 0) { - return Status::OK(); - } - - if (_isShuttingDown()) { - return Status::OK(); - } - - invariant(_oplogBuffer); - - // Wait for enough space. - _oplogApplier->waitForSpace(makeOpCtx().get(), info.toApplyDocumentBytes); - - // Buffer docs for later application. - _oplogApplier->enqueue(makeOpCtx().get(), begin, end); - - _lastFetched = info.lastDocument; - - // TODO: updates metrics with "info". - return Status::OK(); -} - std::string InitialSyncerFCB::Stats::toString() const { return toBSON().toString(); } @@ -2242,23 +1236,5 @@ void InitialSyncerFCB::InitialSyncAttemptInfo::append(BSONObjBuilder* builder) c builder->append("totalTimeUnreachableMillis", totalTimeUnreachableMillis); } -bool InitialSyncerFCB::OplogFetcherRestartDecisionInitialSyncer::shouldContinue( - OplogFetcher* fetcher, Status status) { - if (ErrorCodes::isRetriableError(status)) { - stdx::lock_guard lk(*_sharedData); - return _sharedData->shouldRetryOperation(lk, &_retryingOperation); - } - // A non-network error occured, so clear any network error and use the default restart - // strategy. - _retryingOperation = boost::none; - return _defaultDecision.shouldContinue(fetcher, status); -} - -void InitialSyncerFCB::OplogFetcherRestartDecisionInitialSyncer::fetchSuccessful( - OplogFetcher* fetcher) { - _retryingOperation = boost::none; - _defaultDecision.fetchSuccessful(fetcher); -} - } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h index 54f271fcf106c..e7391a59cc955 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.h +++ b/src/mongo/db/repl/initial_syncer_fcb.h @@ -32,7 +32,6 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #pragma once -#include #include #include #include @@ -50,10 +49,6 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/repl/initial_sync_shared_data.h" #include "mongo/db/repl/initial_syncer_interface.h" #include "mongo/db/repl/multiapplier.h" -#include "mongo/db/repl/oplog_applier.h" -#include "mongo/db/repl/oplog_buffer.h" -#include "mongo/db/repl/oplog_entry.h" -#include "mongo/db/repl/oplog_fetcher.h" #include "mongo/db/repl/optime.h" #include "mongo/db/repl/rollback_checker.h" #include "mongo/executor/scoped_task_executor.h" @@ -61,27 +56,15 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. 
#include "mongo/platform/atomic_word.h" #include "mongo/platform/mutex.h" #include "mongo/stdx/condition_variable.h" -#include "mongo/stdx/mutex.h" #include "mongo/util/concurrency/thread_pool.h" #include "mongo/util/concurrency/with_lock.h" #include "mongo/util/duration.h" -#include "mongo/util/fail_point.h" #include "mongo/util/net/hostandport.h" #include "mongo/util/time_support.h" namespace mongo { namespace repl { -// TODO: Remove forward declares once we remove rs_initialsync.cpp and other dependents. -// Failpoint which fails initial sync and leaves an oplog entry in the buffer. -extern FailPoint failInitSyncWithBufferedEntriesLeftFCB; - -// Failpoint which causes the initial sync function to hang before copying databases. -extern FailPoint initialSyncHangBeforeCopyingDatabasesFCB; - -// Failpoint which stops the applier. -extern FailPoint rsSyncApplyStopFCB; - struct InitialSyncState; class ReplicationProcess; class StorageInterface; @@ -122,28 +105,6 @@ class InitialSyncerFCB : public InitialSyncerInterface { void append(BSONObjBuilder* builder) const; }; - class OplogFetcherRestartDecisionInitialSyncer - : public OplogFetcher::OplogFetcherRestartDecision { - - public: - OplogFetcherRestartDecisionInitialSyncer(InitialSyncSharedData* sharedData, - std::size_t maxFetcherRestarts) - : _sharedData(sharedData), _defaultDecision(maxFetcherRestarts){}; - - bool shouldContinue(OplogFetcher* fetcher, Status status) final; - - void fetchSuccessful(OplogFetcher* fetcher) final; - - private: - InitialSyncSharedData* _sharedData; - - // We delegate to the default strategy when it's a non-network error. - OplogFetcher::OplogFetcherRestartDecisionDefault _defaultDecision; - - // The operation, if any, currently being retried because of a network error. - InitialSyncSharedData::RetryableOperation _retryingOperation; - }; - struct Stats { std::uint32_t failedInitialSyncAttempts{0}; std::uint32_t maxFailedInitialSyncAttempts{0}; @@ -200,24 +161,6 @@ class InitialSyncerFCB : public InitialSyncerInterface { */ void setCreateClientFn_forTest(const CreateClientFn& createClientFn); - /** - * - * Overrides how the initial syncer creates the OplogFetcher. - * - * For testing only. - */ - void setCreateOplogFetcherFn_forTest(std::unique_ptr createOplogFetcherFn); - - /** - * - * Get a raw pointer to the OplogFetcher. Block up to 10s until the underlying OplogFetcher has - * started. It is the caller's responsibility to not reuse this pointer beyond the lifetime of - * the underlying OplogFetcher. - * - * For testing only. - */ - OplogFetcher* getOplogFetcher_forTest() const; - /** * * Provides a separate executor for the cloners, so network operations based on @@ -261,11 +204,6 @@ class InitialSyncerFCB : public InitialSyncerInterface { void setAllowedOutageDuration_forTest(Milliseconds allowedOutageDuration); private: - enum LastOplogEntryFetcherRetryStrategy { - kFetcherHandlesRetries, - kInitialSyncerHandlesRetries - }; - /** * Returns true if we are still processing initial sync tasks (_state is either Running or * Shutdown). 
@@ -287,7 +225,7 @@ class InitialSyncerFCB : public InitialSyncerInterface { /** * Initial sync flowchart: * - * start() + * startup() * | * | * V @@ -318,15 +256,15 @@ class InitialSyncerFCB : public InitialSyncerInterface { * | * | * V - * _lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime() + * _lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime() [removed] * | * | * V - * _getBeginFetchingOpTimeCallback() + * _getBeginFetchingOpTimeCallback() [removed] * | * | * V - * _lastOplogEntryFetcherCallbackForBeginApplyingTimestamp() + * _lastOplogEntryFetcherCallbackForBeginApplyingTimestamp() [removed] * | * | * V @@ -337,29 +275,29 @@ class InitialSyncerFCB : public InitialSyncerInterface { * | | * | | * V V - * _oplogFetcherCallback() _allDatabaseClonerCallback + * _oplogFetcherCallback[removed] _allDatabaseClonerCallback [removed] * | | * | | * | V - * | _lastOplogEntryFetcherCallbackForStopTimestamp() + * | _lastOplogEntryFetcherCallbackForStopTimestamp() [removed] * | | | * | | | * | (no ops to apply) | | (have ops to apply) * | | | * | | V - * | | _getNextApplierBatchCallback() + * | | _getNextApplierBatchCallback() [removed] * | | | ^ * | | | | * | | | (end ts not reached) * | | | | * | | V | - * | | _multiApplierCallback()-----+ + * | | _multiApplierCallback()-----+ [removed] * | | | * | | | * | (reached end timestamp) * | | | * | V V - * | _rollbackCheckerCheckForRollbackCallback() + * | _rollbackCheckerCheckForRollbackCallback() [removed] * | | * | | * +------------------------------+ @@ -415,44 +353,6 @@ class InitialSyncerFCB : public InitialSyncerInterface { void _rollbackCheckerResetCallback(const RollbackChecker::Result& result, std::shared_ptr onCompletionGuard); - /** - * Callback for first '_lastOplogEntryFetcher' callback. A successful response lets us - * determine the default starting point for tailing the oplog using the OplogFetcher if there - * are no active transactions on the sync source. This will be used as the default for the - * beginFetchingTimestamp. - */ - void _lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime( - const StatusWith& result, - std::shared_ptr onCompletionGuard); - - /** - * Schedules a remote command to issue a find command on sync source's transaction table, which - * will get us the optime of the oldest active transaction on that node. It will be used as the - * beginFetchingTimestamp. - */ - Status _scheduleGetBeginFetchingOpTime_inlock( - std::shared_ptr onCompletionGuard, - const OpTime& defaultBeginFetchingOpTime); - - /** - * Callback that gets the optime of the oldest active transaction in the sync source's - * transaction table. It will be used as the beginFetchingTimestamp. - */ - void _getBeginFetchingOpTimeCallback(const StatusWith& result, - std::shared_ptr onCompletionGuard, - const OpTime& defaultBeginFetchingOpTime); - - /** - * Callback for second '_lastOplogEntryFetcher' callback. A successful response lets us - * determine the starting point for applying oplog entries during the oplog application phase - * as well as setting a reference point for the state of the sync source's oplog when data - * cloning completes. - */ - void _lastOplogEntryFetcherCallbackForBeginApplyingTimestamp( - const StatusWith& result, - std::shared_ptr onCompletionGuard, - OpTime& beginFetchingOpTime); - /** * Callback for the '_fCVFetcher'. A successful response lets us check if the remote node * is in a currently acceptable fCV and if it has a 'targetVersion' set. 
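Editor's note: with so many stages marked [removed], the flowchart above is easier to read against the file-copy-based flow this series builds. The sketch below is assembled from _fcbisDraft() in patch 06 and the callbacks added in patches 07 and 08; it is an orientation aid for the reader, not text from the patch:

//   startup()
//      |
//      V
//   _chooseSyncSourceCallback()
//      |
//      V
//   _fetchBackupCursorCallback()        // $backupCursor opened on the sync source
//      |
//      V
//   clone each listed file              // FCBFileCloner, added in patch 08
//      |
//      V
//   _switchStorageLocation(dbpath + ".initialsync"), _moveFiles(), switch back
//      |
//      V
//   _finishInitialSyncAttempt()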
@@ -462,50 +362,6 @@ class InitialSyncerFCB : public InitialSyncerInterface { const OpTime& lastOpTime, OpTime& beginFetchingOpTime); - /** - * Callback for oplog fetcher. - */ - void _oplogFetcherCallback(const Status& status, - std::shared_ptr onCompletionGuard); - - /** - * Callback for DatabasesCloner. - */ - void _allDatabaseClonerCallback(const Status& status, - std::shared_ptr onCompletionGuard); - - /** - * Callback for second '_lastOplogEntryFetcher' callback. This is scheduled to obtain the stop - * timestamp after DatabasesCloner has completed and enables us to determine if the oplog on - * the sync source has advanced since we started cloning the databases. - */ - void _lastOplogEntryFetcherCallbackForStopTimestamp( - const StatusWith& result, - std::shared_ptr onCompletionGuard); - - /** - * Callback to obtain next batch of operations to apply. - */ - void _getNextApplierBatchCallback( - const executor::TaskExecutor::CallbackArgs& callbackArgs, - std::shared_ptr onCompletionGuard) noexcept; - - /** - * Callback for MultiApplier completion. - */ - void _multiApplierCallback(const Status& status, - OpTimeAndWallTime lastApplied, - std::uint32_t numApplied, - std::shared_ptr onCompletionGuard); - - /** - * Callback for rollback checker's last replSetGetRBID command after cloning data and applying - * operations. - */ - void _rollbackCheckerCheckForRollbackCallback( - const RollbackChecker::Result& result, - std::shared_ptr onCompletionGuard); - /** * Reports result of current initial sync attempt. May schedule another initial sync attempt * depending on shutdown state and whether we've exhausted all initial sync retries. @@ -521,52 +377,9 @@ class InitialSyncerFCB : public InitialSyncerInterface { // Returns error if a sync source cannot be found. StatusWith _chooseSyncSource_inlock(); - /** - * Pushes documents from oplog fetcher to blocking queue for - * applier to consume. - * - * Returns a status even though it always returns OK, to conform the interface OplogFetcher - * expects for the EnqueueDocumentsFn. - */ - Status _enqueueDocuments(OplogFetcher::Documents::const_iterator begin, - OplogFetcher::Documents::const_iterator end, - const OplogFetcher::DocumentsInfo& info); - void _appendInitialSyncProgressMinimal_inlock(BSONObjBuilder* bob) const; BSONObj _getInitialSyncProgress_inlock() const; - StatusWith> _getNextApplierBatch_inlock(); - - /** - * Schedules a fetcher to get the last oplog entry from the sync source. - * - * If 'retryStrategy' is 'kFetcherHandlesRetries', the fetcher will retry up to the server - * parameter 'numInitialSyncOplogFindAttempts' times. Otherwise any failures must be handled by - * the caller. - */ - Status _scheduleLastOplogEntryFetcher_inlock(Fetcher::CallbackFn callback, - LastOplogEntryFetcherRetryStrategy retryStrategy); - - /** - * Checks the current oplog application progress (begin and end timestamps). - * If necessary, schedules a _getNextApplierBatchCallback() task. - * If the stop and end timestamps are inconsistent or if there is an issue scheduling the task, - * we set the error status in 'onCompletionGuard' and shut down the OplogFetcher. - * Passes 'lock' through to completion guard. - */ - void _checkApplierProgressAndScheduleGetNextApplierBatch_inlock( - const stdx::lock_guard& lock, std::shared_ptr onCompletionGuard); - - /** - * Schedules a rollback checker to get the rollback ID after data cloning or applying. This - * helps us check if a rollback occurred on the sync source. 
- * If we fail to schedule the rollback checker, we set the error status in 'onCompletionGuard' - * and shut down the OplogFetcher. - * Passes 'lock' through to completion guard. - */ - void _scheduleRollbackCheckerCheckForRollback_inlock( - const stdx::lock_guard& lock, std::shared_ptr onCompletionGuard); - /** * Check if a status is one which means there's a retriable error and we should retry the * current operation, and records whether an operation is currently being retried. Note this @@ -669,22 +482,14 @@ class InitialSyncerFCB : public InitialSyncerInterface { // RollbackChecker to get rollback ID before and after each initial sync attempt. std::unique_ptr _rollbackChecker; // (M) - // Handle returned from RollbackChecker::reset(). - RollbackChecker::CallbackHandle _getBaseRollbackIdHandle; // (M) - // Handle returned from RollbackChecker::checkForRollback(). RollbackChecker::CallbackHandle _getLastRollbackIdHandle; // (M) - // Handle to currently scheduled _getNextApplierBatchCallback() task. - executor::TaskExecutor::CallbackHandle _getNextApplierBatchHandle; // (M) - // The operation, if any, currently being retried because of a network error. InitialSyncSharedData::RetryableOperation _retryingOperation; // (M) std::unique_ptr _initialSyncState; // (M) - std::unique_ptr _oplogFetcher; // (S) std::unique_ptr _beginFetchingOpTimeFetcher; // (S) - std::unique_ptr _lastOplogEntryFetcher; // (S) std::unique_ptr _fCVFetcher; // (S) std::unique_ptr _applier; // (M) HostAndPort _syncSource; // (M) @@ -692,9 +497,6 @@ class InitialSyncerFCB : public InitialSyncerInterface { OpTime _lastFetched; // (MX) OpTimeAndWallTime _lastApplied; // (MX) - std::unique_ptr _oplogBuffer; // (M) - std::unique_ptr _oplogApplier; // (M) - // Used to signal changes in _state. mutable stdx::condition_variable _stateCondition; @@ -704,9 +506,6 @@ class InitialSyncerFCB : public InitialSyncerInterface { // Used to create the DBClientConnection for the cloners CreateClientFn _createClientFn; - // Used to create the OplogFetcher for the InitialSyncerFCB. - std::unique_ptr _createOplogFetcherFn; - // Contains stats on the current initial sync request (includes all attempts). // To access these stats in a user-readable format, use getInitialSyncProgress(). Stats _stats; // (M) From a5903c4caf7b65800b57c5bacba770745c4496bc Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Tue, 28 May 2024 16:54:40 +0100 Subject: [PATCH 06/32] PSMDB-1284 get list of local files via $backupCursor --- src/mongo/db/repl/SConscript | 1 + src/mongo/db/repl/initial_syncer_fcb.cpp | 169 +++++++++++++++++++++-- src/mongo/db/repl/initial_syncer_fcb.h | 39 ++---- 3 files changed, 169 insertions(+), 40 deletions(-) diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index dd0e56c9cc2d8..699dc9df13c85 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1279,6 +1279,7 @@ env.Library( '$BUILD_DIR/mongo/db/server_base', '$BUILD_DIR/mongo/db/serverless/serverless_lock', '$BUILD_DIR/mongo/db/session/session_catalog_mongod', + '$BUILD_DIR/mongo/db/storage/storage_engine_common', '$BUILD_DIR/mongo/executor/scoped_task_executor', 'repl_server_parameters', ], diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index fe3aeace5751e..9202c33dc2c48 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -31,6 +31,10 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. 
#include "initial_syncer_fcb.h" +#include "mongo/client/dbclient_cursor.h" +#include "mongo/db/dbdirectclient.h" +#include "mongo/db/pipeline/aggregate_command_gen.h" +#include "mongo/db/storage/storage_options.h" #include #include #include @@ -38,6 +42,10 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include #include +#include +#include +#include + #include "mongo/base/error_codes.h" #include "mongo/base/status.h" #include "mongo/bson/bsonmisc.h" @@ -45,6 +53,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/bson/timestamp.h" #include "mongo/client/fetcher.h" #include "mongo/db/client.h" +#include "mongo/db/database_name.h" #include "mongo/db/feature_compatibility_version_parser.h" #include "mongo/db/index_builds_coordinator.h" #include "mongo/db/namespace_string.h" @@ -62,6 +71,8 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/repl/tenant_migration_access_blocker_util.h" #include "mongo/db/repl/transaction_oplog_application.h" #include "mongo/db/serverless/serverless_operation_lock_registry.h" +#include "mongo/db/storage/storage_engine.h" +#include "mongo/db/storage/storage_engine_init.h" #include "mongo/executor/task_executor.h" #include "mongo/logv2/log.h" #include "mongo/platform/compiler.h" // IWYU pragma: keep @@ -434,20 +445,6 @@ BSONObj InitialSyncerFCB::_getInitialSyncProgress_inlock() const { return bob.obj(); } -void InitialSyncerFCB::setCreateClientFn_forTest(const CreateClientFn& createClientFn) { - LockGuard lk(_mutex); - _createClientFn = createClientFn; -} - -void InitialSyncerFCB::setClonerExecutor_forTest( - std::shared_ptr clonerExec) { - _clonerExec = std::move(clonerExec); -} - -void InitialSyncerFCB::waitForCloner_forTest() { - _initialSyncState->allDatabaseClonerFuture.wait(); -} - void InitialSyncerFCB::_setUp_inlock(OperationContext* opCtx, std::uint32_t initialSyncMaxAttempts) { // 'opCtx' is passed through from startup(). @@ -680,6 +677,18 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( _syncSource = syncSource.getValue(); + LOGV2_DEBUG(128404, 2, "Reading the list of local files via $backupCUrsor"); + auto bfiles = _getBackupFiles(); + if (!bfiles.isOK()) { + LOGV2_DEBUG( + 128405, 2, "Failed to get the list of local files", "status"_attr = bfiles.getStatus()); + } + LOGV2_DEBUG( + 128406, 2, "Retrieved names of local files", "number"_attr = bfiles.getValue().size()); + // TODO: this is temporary cancelation of initial syn for debugging reasons + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, {ErrorCodes::NotImplemented, "cancel FCBIS for debugging reason"}); + } catch (const DBException&) { // Report exception as an initial syncer failure. stdx::unique_lock lock(_mutex); @@ -1179,6 +1188,138 @@ StatusWith InitialSyncerFCB::_chooseSyncSource_inlock() { return syncSource; } +namespace { + +void moveFile(const std::string& src, const std::string& dst) { + LOGV2_DEBUG(128401, 1, "Moving file", "from"_attr = src, "to"_attr = dst); + + uassert(128402, + "Destination file '{}' already exists"_format(dst), + !boost::filesystem::exists(dst)); + + // Boost filesystem functions clear "ec" on success. 
+ boost::system::error_code ec; + boost::filesystem::rename(src, dst, ec); + if (ec) { + uasserted(128403, + "Error copying file from '{}' to '{}': {}"_format(src, dst, ec.message())); + } +} + +BSONObj makeBackupCursorCmd() { + BSONArrayBuilder pipelineBuilder; + pipelineBuilder << BSON("$backupCursor" << BSONObj()); + return BSON("aggregate" << 1 << "pipeline" << pipelineBuilder.arr() << "cursor" << BSONObj()); +} + +AggregateCommandRequest makeBackupCursorRequest() { + return {NamespaceString::makeCollectionlessAggregateNSS(DatabaseName::kAdmin), + {BSON("$backupCursor" << BSONObj())}}; +} + +} // namespace + +// function to move list of files from one directory to another +Status InitialSyncerFCB::_moveFiles(const std::vector& files, + const std::string& sourceDir, + const std::string& destDir) { + for (const auto& file : files) { + auto sourcePath = sourceDir + "/" + file; + auto destPath = destDir + "/" + file; + moveFile(sourcePath, destPath); + } + return Status::OK(); +} + +// Open a local backup cursor and obtain a list of files from that. +StatusWith> InitialSyncerFCB::_getBackupFiles() { + std::vector files; + try { + // Open a local backup cursor and obtain a list of files from that. + // TODO: ensure _attemptExec usage is correct + //_client->getServerHostAndPort(); + + // Try to use DBDirectClient + auto opCtx = makeOpCtx(); + DBDirectClient client(opCtx.get()); + auto cursor = uassertStatusOK(DBClientCursor::fromAggregationRequest( + &client, makeBackupCursorRequest(), true /* secondaryOk */, false /* useExhaust */)); + if (cursor->more()) { + auto metadata = cursor->next(); + // TODO: remove all logd() calls + logd("isoldbg: $backupCursor metadata: {}", metadata.toString()); + files.reserve(cursor->objsLeftInBatch()); + } + while (cursor->more()) { + auto rec = cursor->next(); + logd("isoldbg: {}", rec.toString()); + files.emplace_back(rec["filename"_sd].String()); + } + + // BSONObj result; + // if (client.runCommand(DatabaseName::kAdmin, makeBackupCursorCmd(), result)) { + // logd("isoldbg: $backupCursor result: {}", result.toString()); + // } else { + // logd("isoldbg: runCommand failed: {}", result.toString()); + // return Status{ErrorCodes::InternalError, "Local $backupCursor failed"}; + // } + + // Use fetcher to run aggregation on sync source + // Fetcher fetcher(_attemptExec.get(), + // host, + // aggRequest.getNamespace().db().toString(), + // aggregation_request_helper::serializeToCommandObj(aggRequest), + // fetcherCallback, + // readPrefMetadata, + // requestTimeout, /* command network timeout */ + // requestTimeout /* getMore network timeout */); + + // Status scheduleStatus = fetcher.schedule(); + // if (!scheduleStatus.isOK()) { + // return scheduleStatus; + // } + + // Status joinStatus = fetcher.join(opCtx); + // if (!joinStatus.isOK()) { + // return joinStatus; + // } + } catch (const DBException& e) { + return e.toStatus(); + } + return files; +} + +// Switch storage location +Status InitialSyncerFCB::_switchStorageLocation(const std::string& newLocation) { + auto opCtx = makeOpCtx(); + auto lastShutdownState = + reinitializeStorageEngine(opCtx.get(), StorageEngineInitFlags{}, [&newLocation] { + storageGlobalParams.dbpath = newLocation; + }); + if (StorageEngine::LastShutdownState::kClean != lastShutdownState) { + return {ErrorCodes::InternalError, + str::stream() << "Failed to switch storage location to " << newLocation}; + } + return Status::OK(); +} + +void InitialSyncerFCB::_fcbisDraft() { + // Switch storage to be pointing to the set of downloaded 
files + _switchStorageLocation(storageGlobalParams.dbpath + ".initialsync"); + // do some cleanup + // TODO: + // Switch storage to a dummy location + _switchStorageLocation(storageGlobalParams.dbpath + ".dummy"); + // Delete the list of files obtained from the local backup cursor + // TODO: + // Move the files from the download location to the normal dbpath + //_moveFiles(files, storageGlobalParams.dbpath + ".initialsync", storageGlobalParams.dbpath); + // Switch storage back to the normal dbpath + _switchStorageLocation(storageGlobalParams.dbpath); + // Reconstruct prepared transactions and other ephemera + // TODO: +} + std::string InitialSyncerFCB::Stats::toString() const { return toBSON().toString(); } diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h index e7391a59cc955..9ab3fd7e497d9 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.h +++ b/src/mongo/db/repl/initial_syncer_fcb.h @@ -153,32 +153,6 @@ class InitialSyncerFCB : public InitialSyncerInterface { void cancelCurrentAttempt() final; - /** - * - * Overrides how the initial syncer creates the client. - * - * For testing only - */ - void setCreateClientFn_forTest(const CreateClientFn& createClientFn); - - /** - * - * Provides a separate executor for the cloners, so network operations based on - * TaskExecutor::scheduleRemoteCommand() can use the NetworkInterfaceMock while the cloners - * are stopped on a failpoint. - * - * For testing only - */ - void setClonerExecutor_forTest(std::shared_ptr clonerExec); - - /** - * - * Wait for the cloner thread to finish. - * - * For testing only - */ - void waitForCloner_forTest(); - // State transitions: // PreStart --> Running --> ShuttingDown --> Complete // It is possible to skip intermediate states. For example, calling shutdown() when the data @@ -437,6 +411,19 @@ class InitialSyncerFCB : public InitialSyncerInterface { template void _shutdownComponent_inlock(Component& component); + /** + * Temporary location to declare all FCB-related private methods + */ + Status _moveFiles(const std::vector& files, + const std::string& sourceDir, + const std::string& destDir); + + StatusWith> _getBackupFiles(); + + Status _switchStorageLocation(const std::string& newLocation); + + void _fcbisDraft(); + // Counts how many documents have been refetched from the source in the current batch. AtomicWord _fetchCount; From d86f9e73e9bdb1e379ece94f06a2a91865d0098a Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Fri, 31 May 2024 01:44:04 +0100 Subject: [PATCH 07/32] PSMDB-1284 Invoke $backupCursor on the sync source --- src/mongo/db/repl/initial_syncer_fcb.cpp | 127 +++++++++++++++++++++-- src/mongo/db/repl/initial_syncer_fcb.h | 10 ++ 2 files changed, 129 insertions(+), 8 deletions(-) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index 9202c33dc2c48..1f4a568a51b6f 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -31,10 +31,6 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "initial_syncer_fcb.h" -#include "mongo/client/dbclient_cursor.h" -#include "mongo/db/dbdirectclient.h" -#include "mongo/db/pipeline/aggregate_command_gen.h" -#include "mongo/db/storage/storage_options.h" #include #include #include @@ -49,14 +45,19 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. 
#include "mongo/base/error_codes.h" #include "mongo/base/status.h" #include "mongo/bson/bsonmisc.h" +#include "mongo/bson/bsonobj.h" #include "mongo/bson/bsonobjbuilder.h" #include "mongo/bson/timestamp.h" +#include "mongo/client/dbclient_cursor.h" #include "mongo/client/fetcher.h" +#include "mongo/client/remote_command_retry_scheduler.h" #include "mongo/db/client.h" #include "mongo/db/database_name.h" +#include "mongo/db/dbdirectclient.h" #include "mongo/db/feature_compatibility_version_parser.h" #include "mongo/db/index_builds_coordinator.h" #include "mongo/db/namespace_string.h" +#include "mongo/db/pipeline/aggregate_command_gen.h" #include "mongo/db/repl/all_database_cloner.h" #include "mongo/db/repl/initial_sync_state.h" #include "mongo/db/repl/initial_syncer_common_stats.h" @@ -70,9 +71,13 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/repl/sync_source_selector.h" #include "mongo/db/repl/tenant_migration_access_blocker_util.h" #include "mongo/db/repl/transaction_oplog_application.h" +#include "mongo/db/server_options.h" #include "mongo/db/serverless/serverless_operation_lock_registry.h" +#include "mongo/db/service_context.h" #include "mongo/db/storage/storage_engine.h" #include "mongo/db/storage/storage_engine_init.h" +#include "mongo/db/storage/storage_options.h" +#include "mongo/executor/remote_command_request.h" #include "mongo/executor/task_executor.h" #include "mongo/logv2/log.h" #include "mongo/platform/compiler.h" // IWYU pragma: keep @@ -288,6 +293,7 @@ void InitialSyncerFCB::cancelCurrentAttempt() { void InitialSyncerFCB::_cancelRemainingWork_inlock() { _cancelHandle_inlock(_startInitialSyncAttemptHandle); _cancelHandle_inlock(_chooseSyncSourceHandle); + _cancelHandle_inlock(_fetchBackupCursorHandle); _cancelHandle_inlock(_getLastRollbackIdHandle); if (_sharedData) { @@ -301,6 +307,7 @@ void InitialSyncerFCB::_cancelRemainingWork_inlock() { _client->shutdownAndDisallowReconnect(); } _shutdownComponent_inlock(_applier); + _shutdownComponent_inlock(_backupCursorFetcher); _shutdownComponent_inlock(_fCVFetcher); _shutdownComponent_inlock(_beginFetchingOpTimeFetcher); (*_attemptExec)->shutdown(); @@ -685,10 +692,18 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( } LOGV2_DEBUG( 128406, 2, "Retrieved names of local files", "number"_attr = bfiles.getValue().size()); - // TODO: this is temporary cancelation of initial syn for debugging reasons - onCompletionGuard->setResultAndCancelRemainingWork_inlock( - lock, {ErrorCodes::NotImplemented, "cancel FCBIS for debugging reason"}); + // schedule $backupCursor on the sync source + status = _scheduleWorkAndSaveHandle_inlock( + [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) { + _fetchBackupCursorCallback(args, onCompletionGuard); + }, + &_fetchBackupCursorHandle, + str::stream() << "_fetchBackupCursorCallback-" << chooseSyncSourceAttempt); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } } catch (const DBException&) { // Report exception as an initial syncer failure. 
@@ -1190,6 +1205,8 @@ StatusWith<HostAndPort> InitialSyncerFCB::_chooseSyncSource_inlock() {
 
 namespace {
 
+constexpr int kBackupCursorFileFetcherRetryAttempts = 10;
+
 void moveFile(const std::string& src, const std::string& dst) {
     LOGV2_DEBUG(128401, 1, "Moving file", "from"_attr = src, "to"_attr = dst);
 
@@ -1237,7 +1254,6 @@ StatusWith<std::vector<std::string>> InitialSyncerFCB::_getBackupFiles() {
     std::vector<std::string> files;
     try {
         // Open a local backup cursor and obtain a list of files from that.
        // TODO: ensure _attemptExec usage is correct
-        //_client->getServerHostAndPort();
 
         // Try to use DBDirectClient
         auto opCtx = makeOpCtx();
@@ -1303,6 +1319,101 @@ Status InitialSyncerFCB::_switchStorageLocation(const std::string& newLocation)
     return Status::OK();
 }
 
+void InitialSyncerFCB::_fetchBackupCursorCallback(
+    const executor::TaskExecutor::CallbackArgs& callbackArgs,
+    // NOLINTNEXTLINE(*-unnecessary-value-param)
+    std::shared_ptr<OnCompletionGuard> onCompletionGuard) noexcept try {
+    stdx::lock_guard<Latch> lock(_mutex);
+    auto status = _checkForShutdownAndConvertStatus_inlock(
+        callbackArgs, "error executing backup cursor on the sync source");
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+
+    const auto aggregateCommandRequestObj = [] {
+        AggregateCommandRequest aggRequest(
+            NamespaceString::makeCollectionlessAggregateNSS(DatabaseName::kAdmin),
+            {BSON("$backupCursor" << BSONObj())});
+        // We must set a writeConcern on internal commands.
+        aggRequest.setWriteConcern(WriteConcernOptions());
+        return aggRequest.toBSON(BSONObj());
+    }();
+
+    LOGV2_DEBUG(128407, 1, "Opening backup cursor on sync source");
+
+    auto fetchStatus = std::make_shared<boost::optional<Status>>();
+    const auto fetcherCallback = [fetchStatus](const Fetcher::QueryResponseStatus& dataStatus,
+                                               Fetcher::NextAction* nextAction,
+                                               BSONObjBuilder* getMoreBob) noexcept {
+        try {
+            uassertStatusOK(dataStatus);
+
+            const auto& data = dataStatus.getValue();
+            for (const BSONObj& doc : data.documents) {
+                if (doc["metadata"]) {
+                    // First batch must contain the metadata.
+                    const auto& metadata = doc["metadata"].Obj();
+                    auto checkpointTimestamp = metadata["checkpointTimestamp"].timestamp();
+
+                    LOGV2_INFO(128409,
+                               "Opened backup cursor on sync source",
+                               "backupCursorId"_attr = data.cursorId,
+                               "backupCursorCheckpointTimestamp"_attr = checkpointTimestamp);
+                    // TODO:
+                } else {
+                    LOGV2_DEBUG(128410,
+                                1,
+                                "Backup cursor entry",
+                                "filename"_attr = doc["filename"].String(),
+                                "backupCursorId"_attr = data.cursorId);
+                    // TODO:
+                }
+            }
+
+            *fetchStatus = Status::OK();
+            if (!getMoreBob || data.documents.empty()) {
+                // Exit fetcher but keep the backupCursor alive to prevent WT on sync source
+                // from modifying file bytes. backupCursor can be closed after all files are
+                // copied
+                *nextAction = Fetcher::NextAction::kExitAndKeepCursorAlive;
+                return;
+            }
+
+            getMoreBob->append("getMore", data.cursorId);
+            getMoreBob->append("collection", data.nss.coll());
+        } catch (DBException& ex) {
+            LOGV2_ERROR(
+                128408, "Error fetching backup cursor entries", "error"_attr = ex.toString());
+            *fetchStatus = ex.toStatus();
+        }
+    };
+
+    _backupCursorFetcher = std::make_unique<Fetcher>(
+        *_attemptExec,
+        _syncSource,
+        DatabaseName::kAdmin.toString(),
+        aggregateCommandRequestObj,
+        fetcherCallback,
+        // ReadPreferenceSetting::secondaryPreferredMetadata(),
+        ReadPreferenceSetting(ReadPreference::PrimaryPreferred).toContainingBSON(),
+        executor::RemoteCommandRequest::kNoTimeout,
+        executor::RemoteCommandRequest::kNoTimeout,
+        RemoteCommandRetryScheduler::makeRetryPolicy(
+            kBackupCursorFileFetcherRetryAttempts, executor::RemoteCommandRequest::kNoTimeout));
+
+    Status scheduleStatus = _backupCursorFetcher->schedule();
+    if (!scheduleStatus.isOK()) {
+        _backupCursorFetcher.reset();
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, scheduleStatus);
+        return;
+    }
+} catch (const DBException&) {
+    // Report exception as an initial syncer failure.
+    stdx::unique_lock<Latch> lock(_mutex);
+    onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus());
+}
+
 void InitialSyncerFCB::_fcbisDraft() {
     // Switch storage to be pointing to the set of downloaded files
     _switchStorageLocation(storageGlobalParams.dbpath + ".initialsync");
diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h
index 9ab3fd7e497d9..0c62aae4d7bac 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.h
+++ b/src/mongo/db/repl/initial_syncer_fcb.h
@@ -314,6 +314,12 @@ class InitialSyncerFCB : public InitialSyncerInterface {
                                    std::uint32_t chooseSyncSourceMaxAttempts,
                                    std::shared_ptr<OnCompletionGuard> onCompletionGuard) noexcept;
 
+    /**
+     * Callback to execute $backupCursor on the sync source
+     */
+    void _fetchBackupCursorCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs,
+                                    std::shared_ptr<OnCompletionGuard> onCompletionGuard) noexcept;
+
     /**
      * This function does the following:
      * 1.) Truncate oplog.
@@ -466,6 +472,9 @@ class InitialSyncerFCB : public InitialSyncerInterface {
     // Handle to currently scheduled _chooseSyncSourceCallback() task.
     executor::TaskExecutor::CallbackHandle _chooseSyncSourceHandle;  // (M)
 
+    // Handle to currently scheduled _fetchBackupCursorCallback() task.
+    executor::TaskExecutor::CallbackHandle _fetchBackupCursorHandle;  // (M)
+
     // RollbackChecker to get rollback ID before and after each initial sync attempt.
std::unique_ptr _rollbackChecker; // (M) @@ -478,6 +487,7 @@ class InitialSyncerFCB : public InitialSyncerInterface { std::unique_ptr _initialSyncState; // (M) std::unique_ptr _beginFetchingOpTimeFetcher; // (S) std::unique_ptr _fCVFetcher; // (S) + std::unique_ptr _backupCursorFetcher; // (S) std::unique_ptr _applier; // (M) HostAndPort _syncSource; // (M) std::unique_ptr _client; // (M) From b905f2ea08003951957b400ab44394f80cfe55dc Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Mon, 10 Jun 2024 01:51:37 +0100 Subject: [PATCH 08/32] PSMDB-1284 implement file cloning from the sync source --- src/mongo/db/repl/SConscript | 3 + src/mongo/db/repl/fcb_file_cloner.cpp | 312 +++++++++++++++++++++++ src/mongo/db/repl/fcb_file_cloner.h | 228 +++++++++++++++++ src/mongo/db/repl/initial_syncer_fcb.cpp | 215 ++++++++++++++-- src/mongo/db/repl/initial_syncer_fcb.h | 28 ++ 5 files changed, 766 insertions(+), 20 deletions(-) create mode 100644 src/mongo/db/repl/fcb_file_cloner.cpp create mode 100644 src/mongo/db/repl/fcb_file_cloner.h diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index 699dc9df13c85..ccfa0f6c5d55f 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1251,6 +1251,7 @@ env.Library( env.Library( target='initial_syncer', source=[ + 'fcb_file_cloner.cpp', 'initial_syncer.cpp', 'initial_syncer_common_stats.cpp', 'initial_syncer_factory.cpp', @@ -1281,7 +1282,9 @@ env.Library( '$BUILD_DIR/mongo/db/session/session_catalog_mongod', '$BUILD_DIR/mongo/db/storage/storage_engine_common', '$BUILD_DIR/mongo/executor/scoped_task_executor', + '$BUILD_DIR/mongo/util/progress_meter', 'repl_server_parameters', + 'replication_auth', ], ) diff --git a/src/mongo/db/repl/fcb_file_cloner.cpp b/src/mongo/db/repl/fcb_file_cloner.cpp new file mode 100644 index 0000000000000..30b7baf4a827a --- /dev/null +++ b/src/mongo/db/repl/fcb_file_cloner.cpp @@ -0,0 +1,312 @@ +/*====== +This file is part of Percona Server for MongoDB. + +Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. + + This program is free software: you can redistribute it and/or modify + it under the terms of the Server Side Public License, version 1, + as published by MongoDB, Inc. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + Server Side Public License for more details. + + You should have received a copy of the Server Side Public License + along with this program. If not, see + . + + As a special exception, the copyright holders give permission to link the + code of portions of this program with the OpenSSL library under certain + conditions as described in each individual source file and distribute + linked combinations including the program with the OpenSSL library. You + must comply with the Server Side Public License in all respects for + all of the code used other than as permitted herein. If you modify file(s) + with this exception, you may extend this exception to your version of the + file(s), but you are not obligated to do so. If you do not wish to do so, + delete this exception statement from your version. If you delete this + exception statement from all source files in the program, then also delete + it in the license file. 
+======= */ + +#include "fcb_file_cloner.h" + +#include +#include + +#include + +#include "mongo/base/string_data.h" +#include "mongo/bson/bsonelement.h" +#include "mongo/bson/bsonmisc.h" +#include "mongo/bson/bsontypes.h" +#include "mongo/client/dbclient_connection.h" +#include "mongo/db/database_name.h" +#include "mongo/db/namespace_string.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/pipeline/aggregate_command_gen.h" +#include "mongo/db/pipeline/aggregation_request_helper.h" +#include "mongo/db/repl/read_concern_args.h" +#include "mongo/db/storage/storage_options.h" +#include "mongo/db/write_concern_options.h" +#include "mongo/platform/mutex.h" +#include "mongo/stdx/mutex.h" +#include "mongo/util/assert_util.h" +#include "mongo/util/clock_source.h" +#include "mongo/util/concurrency/with_lock.h" +#include "mongo/util/net/hostandport.h" +#include "mongo/util/str.h" + + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kReplicationInitialSync + + +namespace mongo::repl { + +FCBFileCloner::FCBFileCloner(const UUID& backupId, + const std::string& remoteFileName, + size_t remoteFileSize, + const std::string& relativePath, + InitialSyncSharedData* sharedData, + const HostAndPort& source, + DBClientConnection* client, + StorageInterface* storageInterface, + ThreadPool* dbPool) + : BaseCloner("FCBFileCloner"_sd, sharedData, source, client, storageInterface, dbPool), + _backupId(backupId), + _remoteFileName(remoteFileName), + _remoteFileSize(remoteFileSize), + _relativePathString(relativePath), + _queryStage("query", this, &FCBFileCloner::queryStage), + _fsWorkTaskRunner(dbPool), + _scheduleFsWorkFn([this](executor::TaskExecutor::CallbackFn work) { + auto task = [this, work = std::move(work)]( + OperationContext* opCtx, + const Status& status) mutable noexcept -> TaskRunner::NextAction { + try { + work(executor::TaskExecutor::CallbackArgs(nullptr, {}, status, opCtx)); + } catch (const DBException& e) { + setSyncFailedStatus(e.toStatus()); + } + return TaskRunner::NextAction::kDisposeOperationContext; + }; + _fsWorkTaskRunner.schedule(std::move(task)); + return executor::TaskExecutor::CallbackHandle(); + }), + _progressMeter(remoteFileSize, + kProgressMeterSecondsBetween, + kProgressMeterCheckInterval, + "bytes copied", + str::stream() << _remoteFileName << " FCB file clone progress") { + _stats.filePath = _relativePathString; + _stats.fileSize = _remoteFileSize; +} + +BaseCloner::ClonerStages FCBFileCloner::getStages() { + return {&_queryStage}; +} + +void FCBFileCloner::preStage() { + stdx::lock_guard lk(_mutex); + _stats.start = getSharedData()->getClock()->now(); + + // Construct local path name from the relative path and the temp dbpath. + boost::filesystem::path relativePath(_relativePathString); + uassert(6113300, + str::stream() << "Path " << _relativePathString << " should be a relative path", + relativePath.is_relative()); + + boost::filesystem::path syncTargetTempDBPath{storageGlobalParams.dbpath}; + syncTargetTempDBPath /= ".initialsync"; + _localFilePath = syncTargetTempDBPath; + + _localFilePath /= relativePath; + _localFilePath = _localFilePath.lexically_normal(); + uassert(6113301, + str::stream() << "Path " << _relativePathString + << " must not escape its parent directory.", + StringData(_localFilePath.generic_string()) + .startsWith(syncTargetTempDBPath.generic_string())); + + // Create and open files and any parent directories. 
+ if (boost::filesystem::exists(_localFilePath)) { + LOGV2(6113302, + "Local file exists at start of FCBFileCloner; truncating.", + "localFilePath"_attr = _localFilePath.string()); + } else { + auto localFileDir = _localFilePath.parent_path(); + boost::system::error_code ec; + boost::filesystem::create_directories(localFileDir, ec); + uassert(6113303, + str::stream() << "Failed to create directory " << localFileDir.string() << " Error " + << ec.message(), + !ec); + } + _localFile.open(_localFilePath.string(), + std::ios_base::out | std::ios_base::binary | std::ios_base::trunc); + uassert(ErrorCodes::FileOpenFailed, + str::stream() << "Failed to open file " << _localFilePath.string(), + !_localFile.fail()); + _fileOffset = 0; +} + +void FCBFileCloner::postStage() { + _localFile.close(); + stdx::lock_guard lk(_mutex); + _stats.end = getSharedData()->getClock()->now(); +} + +BaseCloner::AfterStageBehavior FCBFileCloner::queryStage() { + // Since the query stage may be re-started, we need to make sure all the file system work + // from the previous run is done before running the query again. + waitForFilesystemWorkToComplete(); + _sawEof = false; + runQuery(); + waitForFilesystemWorkToComplete(); + uassert( + 6113304, + str::stream() + << "Received entire file, but did not get end of file marker. File may be incomplete " + << _localFilePath.string(), + _sawEof); + return kContinueNormally; +} + +size_t FCBFileCloner::getFileOffset() { + stdx::lock_guard lk(_mutex); + return _fileOffset; +} + +void FCBFileCloner::runQuery() { + auto backupFileStage = BSON( + "$_backupFile" << BSON("backupId" << _backupId << "file" << _remoteFileName << "byteOffset" + << static_cast(getFileOffset()))); + AggregateCommandRequest aggRequest( + NamespaceString::makeCollectionlessAggregateNSS(DatabaseName::kAdmin), {backupFileStage}); + aggRequest.setReadConcern(ReadConcernArgs::kImplicitDefault); + aggRequest.setWriteConcern(WriteConcernOptions()); + + LOGV2_DEBUG(6113305, + 2, + "FCBFileCloner running aggregation", + "source"_attr = getSource(), + "aggRequest"_attr = aggregation_request_helper::serializeToCommandObj(aggRequest)); + const bool useExhaust = + true; // TODO: !MONGO_unlikely(FCBFileClonerDisableExhaust.shouldFail()); + std::unique_ptr cursor = uassertStatusOK(DBClientCursor::fromAggregationRequest( + getClient(), std::move(aggRequest), true /* secondaryOk */, useExhaust)); + try { + while (cursor->more()) { + handleNextBatch(*cursor); + } + } catch (const DBException& e) { + // We cannot continue after an error when processing exhaust cursors. Instead we must + // reconnect, which is handled by the BaseCloner. 
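+        // [Editor's sketch, not part of the patch] Retrying after the reconnect
+        // is safe because _fileOffset only advances after bytes are written in
+        // writeDataToFilesystemCallback(), so the re-issued query resumes where
+        // the local file ends. With 16 MiB already written, the next runQuery()
+        // would send (stage shape taken from runQuery() above):
+        //   BSON("$_backupFile" << BSON("backupId" << _backupId
+        //                               << "file" << _remoteFileName
+        //                               << "byteOffset"
+        //                               << static_cast<long long>(16 * 1024 * 1024)))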
+ LOGV2_DEBUG(6113306, + 1, + "FCBFileCloner received an exception while downloading data", + "error"_attr = e.toStatus(), + "source"_attr = getSource(), + "backupId"_attr = _backupId, + "remoteFile"_attr = _remoteFileName, + "fileOffset"_attr = getFileOffset()); + getClient()->shutdown(); + throw; + } +} + +void FCBFileCloner::handleNextBatch(DBClientCursor& cursor) { + LOGV2_DEBUG(6113307, + 4, + "FCBFileCloner handleNextBatch", + "source"_attr = getSource(), + "backupId"_attr = _backupId, + "remoteFile"_attr = _remoteFileName, + "fileOffset"_attr = getFileOffset()); + { + stdx::lock_guard lk(*getSharedData()); + if (!getSharedData()->getStatus(lk).isOK()) { + static constexpr char const* message = + "BackupFile cloning cancelled due to cloning failure"; + LOGV2(6113323, message, "error"_attr = getSharedData()->getStatus(lk)); + uasserted(ErrorCodes::CallbackCanceled, + str::stream() << message << ": " << getSharedData()->getStatus(lk)); + } + } + { + stdx::lock_guard lk(_mutex); + _stats.receivedBatches++; + while (cursor.moreInCurrentBatch()) { + _dataToWrite.emplace_back(cursor.nextSafe()); + } + } + + // Schedule the next set of writes. + auto&& scheduleResult = + _scheduleFsWorkFn([this](const executor::TaskExecutor::CallbackArgs& cbd) { + writeDataToFilesystemCallback(cbd); + }); + + if (!scheduleResult.isOK()) { + Status newStatus = scheduleResult.getStatus().withContext( + str::stream() << "Error copying file '" << _remoteFileName << "'"); + // We must throw an exception to terminate query. + uassertStatusOK(newStatus); + } +} + +void FCBFileCloner::writeDataToFilesystemCallback(const executor::TaskExecutor::CallbackArgs& cbd) { + LOGV2_DEBUG(6113309, + 4, + "FCBFileCloner writeDataToFilesystemCallback", + "backupId"_attr = _backupId, + "remoteFile"_attr = _remoteFileName, + "localFile"_attr = _localFilePath.string(), + "fileOffset"_attr = getFileOffset()); + uassertStatusOK(cbd.status); + { + stdx::lock_guard lk(_mutex); + if (_dataToWrite.empty()) { + LOGV2_WARNING(6113310, + "writeDataToFilesystemCallback, but no data to write", + "remoteFile"_attr = _remoteFileName); + } + for (const auto& doc : _dataToWrite) { + uassert(6113311, + str::stream() << "Saw multiple end-of-file-markers in file " << _remoteFileName, + !_sawEof); + // Received file data should always be in sync with the stream and where we think + // our next input should be coming from. + const auto byteOffset = doc["byteOffset"].safeNumberLong(); + invariant(byteOffset == _localFile.tellp()); + invariant(byteOffset == _fileOffset); + const auto& dataElem = doc["data"]; + uassert(6113312, + str::stream() << "Expected file data to be type BinDataGeneral. 
" << doc, + dataElem.type() == BinData && dataElem.binDataType() == BinDataGeneral); + int dataLength = 0; + const char* data = dataElem.binData(dataLength); + _localFile.write(data, dataLength); + uassert(ErrorCodes::FileStreamFailed, + str::stream() << "Unable to write file data for file " << _remoteFileName + << " at offset " << _fileOffset, + !_localFile.fail()); + _progressMeter.hit(dataLength); + _fileOffset += dataLength; + _stats.bytesCopied += dataLength; + _sawEof = doc["endOfFile"].booleanSafe(); + } + _dataToWrite.clear(); + _stats.writtenBatches++; + } +} + +void FCBFileCloner::waitForFilesystemWorkToComplete() { + _fsWorkTaskRunner.join(); +} + +logv2::LogComponent FCBFileCloner::getLogComponent() { + return logv2::LogComponent::kReplicationInitialSync; +} + +} // namespace mongo::repl diff --git a/src/mongo/db/repl/fcb_file_cloner.h b/src/mongo/db/repl/fcb_file_cloner.h new file mode 100644 index 0000000000000..d12f458352575 --- /dev/null +++ b/src/mongo/db/repl/fcb_file_cloner.h @@ -0,0 +1,228 @@ +/*====== +This file is part of Percona Server for MongoDB. + +Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. + + This program is free software: you can redistribute it and/or modify + it under the terms of the Server Side Public License, version 1, + as published by MongoDB, Inc. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + Server Side Public License for more details. + + You should have received a copy of the Server Side Public License + along with this program. If not, see + . + + As a special exception, the copyright holders give permission to link the + code of portions of this program with the OpenSSL library under certain + conditions as described in each individual source file and distribute + linked combinations including the program with the OpenSSL library. You + must comply with the Server Side Public License in all respects for + all of the code used other than as permitted herein. If you modify file(s) + with this exception, you may extend this exception to your version of the + file(s), but you are not obligated to do so. If you do not wish to do so, + delete this exception statement from your version. If you delete this + exception statement from all source files in the program, then also delete + it in the license file. 
+======= */
+
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include "mongo/base/checked_cast.h"
+#include "mongo/base/error_codes.h"
+#include "mongo/base/status.h"
+#include "mongo/base/status_with.h"
+#include "mongo/bson/bsonobj.h"
+#include "mongo/bson/bsonobjbuilder.h"
+#include "mongo/client/dbclient_connection.h"
+#include "mongo/client/dbclient_cursor.h"
+#include "mongo/db/repl/base_cloner.h"
+#include "mongo/db/repl/initial_sync_shared_data.h"
+#include "mongo/db/repl/storage_interface.h"
+#include "mongo/db/repl/task_runner.h"
+#include "mongo/executor/task_executor.h"
+#include "mongo/util/concurrency/thread_pool.h"
+#include "mongo/util/functional.h"
+#include "mongo/util/net/hostandport.h"
+#include "mongo/util/progress_meter.h"
+#include "mongo/util/time_support.h"
+#include "mongo/util/uuid.h"
+
+
+namespace mongo::repl {
+
+class FCBFileCloner final : public BaseCloner {
+public:
+    struct Stats {
+        std::string filePath;
+        size_t fileSize;
+        Date_t start;
+        Date_t end;
+        size_t receivedBatches{0};
+        size_t writtenBatches{0};
+        size_t bytesCopied{0};
+
+        std::string toString() const;
+        BSONObj toBSON() const;
+        void append(BSONObjBuilder* builder) const;
+    };
+
+    /**
+     * Type of function to schedule file system tasks with the executor.
+     */
+    using ScheduleFsWorkFn = unique_function<StatusWith<executor::TaskExecutor::CallbackHandle>(
+        executor::TaskExecutor::CallbackFn)>;
+
+    /**
+     * Constructor for FCBFileCloner
+     *
+     * remoteFileName: Path of file to copy on remote system.
+     * remoteFileSize: Size of remote file in bytes, used for progress messages and stats only.
+     * relativePath: Path of file relative to dbpath on the remote system, as a
+     *               boost::filesystem::path generic path.
+     */
+    FCBFileCloner(const UUID& backupId,
+                  const std::string& remoteFileName,
+                  size_t remoteFileSize,
+                  const std::string& relativePath,
+                  InitialSyncSharedData* sharedData,
+                  const HostAndPort& source,
+                  DBClientConnection* client,
+                  StorageInterface* storageInterface,
+                  ThreadPool* dbPool);
+
+    ~FCBFileCloner() override = default;
+
+    /**
+     * Waits for any file system work to finish or fail.
+     */
+    void waitForFilesystemWorkToComplete();
+
+protected:
+    InitialSyncSharedData* getSharedData() const override {
+        return checked_cast<InitialSyncSharedData*>(BaseCloner::getSharedData());
+    }
+
+    ClonerStages getStages() final;
+
+private:
+    class FCBFileClonerQueryStage : public ClonerStage<FCBFileCloner> {
+    public:
+        FCBFileClonerQueryStage(std::string name, FCBFileCloner* cloner, ClonerRunFn stageFunc)
+            : ClonerStage<FCBFileCloner>(std::move(name), cloner, stageFunc) {}
+
+        bool checkSyncSourceValidityOnRetry() override {
+            // Sync source validity is assured by the backup ID not existing if the sync source
+            // is restarted or otherwise becomes invalid.
+            return false;
+        }
+
+        bool isTransientError(const Status& status) override {
+            if (isCursorError(status)) {
+                return true;
+            }
+            return ErrorCodes::isRetriableError(status);
+        }
+
+        static bool isCursorError(const Status& status) {
+            // Our cursor was killed on the sync source.
+            return (status == ErrorCodes::CursorNotFound) ||
+                (status == ErrorCodes::OperationFailed) || (status == ErrorCodes::QueryPlanKilled);
+        }
+    };
+
+    /**
+     * Overridden to allow the BaseCloner to use the initial syncer log component.
+     */
+    logv2::LogComponent getLogComponent() override;
+
+    // TODO: do we need Stats/getStats in this class?
+    /**
+     * The preStage sets the begin time in _stats and makes sure the destination file
+     * can be created.
+     */
+    void preStage() final;
+
+    /**
+     * The postStage sets the end time in _stats.
+     */
+    void postStage() final;
+
+
+    /**
+     * Stage function that executes a query to retrieve the file data. For each
+     * batch returned by the upstream node, handleNextBatch will be called with the data. This
+     * stage will finish when the entire query is finished or failed.
+     */
+    AfterStageBehavior queryStage();
+
+    /**
+     * Put all results from a query batch into a buffer, and schedule it to be written to disk.
+     */
+    void handleNextBatch(DBClientCursor& cursor);
+
+    /**
+     * Called whenever there is a new batch of documents ready from the DBClientConnection.
+     *
+     * Each buffered document's data payload is appended to the local file.
+     */
+    void writeDataToFilesystemCallback(const executor::TaskExecutor::CallbackArgs& cbd);
+
+    /**
+     * Sends an (aggregation) query command to the source. That query command will be parameterized
+     * based on copy progress.
+     */
+    void runQuery();
+
+    /**
+     * Convenience call to get the file offset under a lock.
+     */
+    size_t getFileOffset();
+
+    // All member variables are labeled with one of the following codes indicating the
+    // synchronization rules for accessing them.
+    //
+    // (R) Read-only in concurrent operation; no synchronization required.
+    // (S) Self-synchronizing; access according to class's own rules.
+    // (M) Reads and writes guarded by _mutex (defined in base class).
+    // (X) Access only allowed from the main flow of control called from run() or constructor.
+    const UUID _backupId;                   // (R)
+    const std::string _remoteFileName;      // (R)
+    size_t _remoteFileSize;                 // (R)
+    const std::string _relativePathString;  // (R)
+    boost::filesystem::path _localFilePath; // (X)
+
+    FCBFileClonerQueryStage _queryStage;  // (R)
+
+    std::ofstream _localFile;  // (M)
+    // File offset we will request from the remote side in the next query.
+    off_t _fileOffset = 0;  // (M)
+    bool _sawEof = false;   // (X)
+
+    // Data read from source to insert.
+    std::vector<BSONObj> _dataToWrite;  // (M)
+    // Putting _fsWorkTaskRunner last ensures anything the database work threads depend on
+    // like _dataToWrite, is destroyed after those threads exit.
+    TaskRunner _fsWorkTaskRunner;  // (R)
+    // Function for scheduling filesystem work using the executor.
+    ScheduleFsWorkFn _scheduleFsWorkFn;  // (R)
+
+    ProgressMeter _progressMeter;  // (X) progress meter for this instance.
+    Stats _stats;                  // (M)
+
+    static constexpr int kProgressMeterSecondsBetween = 60;
+    static constexpr int kProgressMeterCheckInterval = 128;
+};
+
+} // namespace mongo::repl
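Editor's note: a hypothetical usage sketch of the class declared above, to make the constructor contract concrete. Every value is invented; run() is assumed to be inherited from BaseCloner, and the relative path is expected in generic (slash) form:

// Clone one file advertised by the sync source's $backupCursor. backupId,
// sharedData, source, client, storageInterface and dbPool are assumed to
// exist in the surrounding scope.
FCBFileCloner cloner(backupId,
                     "/sync-source-dbpath/collection-7-2111.wt",  // remote file name
                     16384,                                       // remote size in bytes
                     "collection-7-2111.wt",       // path relative to remote dbpath
                     sharedData,
                     source,
                     client,
                     storageInterface,
                     dbPool);
uassertStatusOK(cloner.run());  // returns once the file is fully copied or fails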
diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index 1f4a568a51b6f..5feaeec4bee33 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -59,12 +59,14 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved.
#include "mongo/db/namespace_string.h" #include "mongo/db/pipeline/aggregate_command_gen.h" #include "mongo/db/repl/all_database_cloner.h" +#include "mongo/db/repl/fcb_file_cloner.h" #include "mongo/db/repl/initial_sync_state.h" #include "mongo/db/repl/initial_syncer_common_stats.h" #include "mongo/db/repl/initial_syncer_factory.h" #include "mongo/db/repl/initial_syncer_interface.h" #include "mongo/db/repl/optime.h" #include "mongo/db/repl/repl_server_parameters_gen.h" +#include "mongo/db/repl/replication_auth.h" #include "mongo/db/repl/replication_consistency_markers.h" #include "mongo/db/repl/replication_process.h" #include "mongo/db/repl/storage_interface.h" @@ -134,6 +136,12 @@ using QueryResponseStatus = StatusWith; using UniqueLock = stdx::unique_lock; using LockGuard = stdx::lock_guard; +constexpr StringData kMetadataFieldName = "metadata"_sd; +constexpr StringData kBackupIdFieldName = "backupId"_sd; +constexpr StringData kDBPathFieldName = "dbpath"_sd; +constexpr StringData kFileNameFieldName = "filename"_sd; +constexpr StringData kFileSizeFieldName = "fileSize"_sd; + // Used to reset the oldest timestamp during initial sync to a non-null timestamp. const Timestamp kTimestampOne(0, 1); @@ -141,6 +149,25 @@ ServiceContext::UniqueOperationContext makeOpCtx() { return cc().makeOperationContext(); } +/** + * Computes a boost::filesystem::path generic-style relative path (always uses slashes) + * from a base path and a relative path. + */ +std::string getPathRelativeTo(const std::string& path, const std::string& basePath) { + if (basePath.empty() || path.find(basePath) != 0) { + uasserted(6113319, + str::stream() << "The file " << path << " is not a subdirectory of " << basePath); + } + + auto result = path.substr(basePath.size()); + // Skip separators at the beginning of the relative part. 
+ if (!result.empty() && (result[0] == '/' || result[0] == '\\')) { + result.erase(result.begin()); + } + + std::replace(result.begin(), result.end(), '\\', '/'); + return result; +} } // namespace const ServiceContext::ConstructorActionRegisterer initialSyncerRegistererFCB( @@ -179,6 +206,7 @@ InitialSyncerFCB::InitialSyncerFCB( _writerPool(writerPool), _storage(storage), _replicationProcess(replicationProcess), + _backupId(UUID::fromCDR(std::array{})), _onCompletion(onCompletion), _createClientFn( [] { return std::make_unique(true /* autoReconnect */); }) { @@ -293,7 +321,9 @@ void InitialSyncerFCB::cancelCurrentAttempt() { void InitialSyncerFCB::_cancelRemainingWork_inlock() { _cancelHandle_inlock(_startInitialSyncAttemptHandle); _cancelHandle_inlock(_chooseSyncSourceHandle); + _cancelHandle_inlock(_getBaseRollbackIdHandle); _cancelHandle_inlock(_fetchBackupCursorHandle); + _cancelHandle_inlock(_transferFileHandle); _cancelHandle_inlock(_getLastRollbackIdHandle); if (_sharedData) { @@ -682,9 +712,7 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( return; } - _syncSource = syncSource.getValue(); - - LOGV2_DEBUG(128404, 2, "Reading the list of local files via $backupCUrsor"); + LOGV2_DEBUG(128404, 2, "Reading the list of local files via $backupCursor"); auto bfiles = _getBackupFiles(); if (!bfiles.isOK()) { LOGV2_DEBUG( @@ -693,23 +721,26 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( LOGV2_DEBUG( 128406, 2, "Retrieved names of local files", "number"_attr = bfiles.getValue().size()); - // schedule $backupCursor on the sync source - status = _scheduleWorkAndSaveHandle_inlock( - [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) { - _fetchBackupCursorCallback(args, onCompletionGuard); - }, - &_fetchBackupCursorHandle, - str::stream() << "_fetchBackupCursorCallback-" << chooseSyncSourceAttempt); + _syncSource = syncSource.getValue(); + + // Schedule rollback ID checker. + _rollbackChecker = std::make_unique(*_attemptExec, _syncSource); + auto scheduleResult = _rollbackChecker->reset([=](const RollbackChecker::Result& result) { + return _rollbackCheckerResetCallback(result, onCompletionGuard); + }); + status = scheduleResult.getStatus(); if (!status.isOK()) { onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); return; } + _getBaseRollbackIdHandle = scheduleResult.getValue(); } catch (const DBException&) { // Report exception as an initial syncer failure. stdx::unique_lock lock(_mutex); onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus()); } +// TODO: we probably don't need this in FCBIS Status InitialSyncerFCB::_truncateOplogAndDropReplicatedDatabases() { // truncate oplog; drop user databases. 
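 // (In FCBIS the entire data directory is eventually replaced by the files
 // copied from the sync source, which is why the TODO above questions whether
 // this logical cleanup is still needed; an observation about the design, not
 // behavior added here.)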
LOGV2_DEBUG(4540700, @@ -762,6 +793,36 @@ Status InitialSyncerFCB::_truncateOplogAndDropReplicatedDatabases() { return _storage->dropReplicatedDatabases(opCtx.get()); } +void InitialSyncerFCB::_rollbackCheckerResetCallback( + const RollbackChecker::Result& result, std::shared_ptr onCompletionGuard) { + stdx::lock_guard lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock(result.getStatus(), + "error while getting base rollback ID"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + // we will need shared data to clone files from sync source + _sharedData = + std::make_unique(_rollbackChecker->getBaseRBID(), + _allowedOutageDuration, + getGlobalServiceContext()->getFastClockSource()); + _client = _createClientFn(); + + // schedule $backupCursor on the sync source + status = _scheduleWorkAndSaveHandle_inlock( + [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) { + _fetchBackupCursorCallback(args, onCompletionGuard); + }, + &_fetchBackupCursorHandle, + "_fetchBackupCursorCallback"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} + void InitialSyncerFCB::_fcvFetcherCallback(const StatusWith& result, std::shared_ptr onCompletionGuard, const OpTime& lastOpTime, @@ -1253,7 +1314,6 @@ StatusWith> InitialSyncerFCB::_getBackupFiles() { std::vector files; try { // Open a local backup cursor and obtain a list of files from that. - // TODO: ensure _attemptExec usage is correct // Try to use DBDirectClient auto opCtx = makeOpCtx(); @@ -1269,7 +1329,7 @@ StatusWith> InitialSyncerFCB::_getBackupFiles() { while (cursor->more()) { auto rec = cursor->next(); logd("isoldbg: {}", rec.toString()); - files.emplace_back(rec["filename"_sd].String()); + files.emplace_back(rec[kFileNameFieldName].String()); } // BSONObj result; @@ -1319,6 +1379,8 @@ Status InitialSyncerFCB::_switchStorageLocation(const std::string& newLocation) return Status::OK(); } +// TenantMigrationRecipientService::Instance::_openBackupCursor +// ShardMergeRecipientService::Instance::_openBackupCursor void InitialSyncerFCB::_fetchBackupCursorCallback( const executor::TaskExecutor::CallbackArgs& callbackArgs, // NOLINTNEXTLINE(*-unnecessary-value-param) @@ -1343,31 +1405,38 @@ void InitialSyncerFCB::_fetchBackupCursorCallback( LOGV2_DEBUG(128407, 1, "Opening backup cursor on sync source"); auto fetchStatus = std::make_shared>(); - const auto fetcherCallback = [fetchStatus](const Fetcher::QueryResponseStatus& dataStatus, - Fetcher::NextAction* nextAction, - BSONObjBuilder* getMoreBob) noexcept { + const auto fetcherCallback = [this, fetchStatus](const Fetcher::QueryResponseStatus& dataStatus, + Fetcher::NextAction* nextAction, + BSONObjBuilder* getMoreBob) noexcept { try { uassertStatusOK(dataStatus); const auto& data = dataStatus.getValue(); for (const BSONObj& doc : data.documents) { - if (doc["metadata"]) { + if (doc[kMetadataFieldName]) { // First batch must contain the metadata. 
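+                    // Illustrative shapes of the two document kinds handled here
+                    // (field lists are assumptions based on what this callback
+                    // reads; real $backupCursor batches may carry more fields):
+                    //   { metadata: { backupId: <UUID>, dbpath: "/data/db",
+                    //                 checkpointTimestamp: <Timestamp>, ... } }
+                    //   { filename: "/data/db/collection-0.wt", fileSize: 16384 }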
- const auto& metadata = doc["metadata"].Obj(); + const auto& metadata = doc[kMetadataFieldName].Obj(); auto checkpointTimestamp = metadata["checkpointTimestamp"].timestamp(); + _backupId = UUID(uassertStatusOK(UUID::parse(metadata[kBackupIdFieldName]))); + _remoteDBPath = metadata[kDBPathFieldName].String(); LOGV2_INFO(128409, "Opened backup cursor on sync source", "backupCursorId"_attr = data.cursorId, + "remoteDBPath"_attr = _remoteDBPath, "backupCursorCheckpointTimestamp"_attr = checkpointTimestamp); - // TODO: + // empty _remoteFiles on new sync attempt start + _remoteFiles.clear(); } else { + auto fileName = doc[kFileNameFieldName].String(); + auto fileSize = doc[kFileSizeFieldName].numberLong(); LOGV2_DEBUG(128410, 1, "Backup cursor entry", - "filename"_attr = doc["filename"].String(), + "filename"_attr = fileName, + "fileSize"_attr = fileSize, "backupCursorId"_attr = data.cursorId); - // TODO: + _remoteFiles.emplace_back(fileName, fileSize); } } @@ -1408,6 +1477,112 @@ void InitialSyncerFCB::_fetchBackupCursorCallback( onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, scheduleStatus); return; } + + _backupCursorFetcher->onCompletion() + .thenRunOn(**_attemptExec) + .then([this, fetchStatus, onCompletionGuard, &lock] { + logd("Backup cursor fetcher completion callback"); + if (!*fetchStatus) { + // the callback was never invoked + uasserted(128411, "Internal error running cursor callback in command"); + } + uassertStatusOK(fetchStatus->get()); + + uassert(128414, + "Internal error: no file names collected from sync source", + !_remoteFiles.empty()); + + // schedule file transfer callback + auto status = _scheduleWorkAndSaveHandle_inlock( + [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) { + _transferFileCallback(args, 0lu, onCompletionGuard); + }, + &_transferFileHandle, + str::stream() << "_transferFileCallback-" << 0); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + }) + .wait(); + +} catch (const DBException&) { + // Report exception as an initial syncer failure. 
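+    // (exceptionToStatus() converts the in-flight DBException into a Status; the
+    // completion guard records it and cancels any outstanding scheduled callbacks
+    // while _mutex is held.)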
+ stdx::unique_lock lock(_mutex); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus()); +} + +// tenant_migration_shard_merge_util.cpp : cloneFile +void InitialSyncerFCB::_transferFileCallback( + const executor::TaskExecutor::CallbackArgs& callbackArgs, + std::size_t fileIdx, + // NOLINTNEXTLINE(*-unnecessary-value-param) + std::shared_ptr onCompletionGuard) noexcept try { + stdx::lock_guard lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock( + callbackArgs, "error transferring file from sync source"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + // create connection to the sync source + DBClientConnection syncSourceConn{true /* autoReconnect */}; + status = syncSourceConn.connect(_syncSource, "File copy-based initial sync", boost::none); + if (status.isOK()) { + status = replAuthenticate(&syncSourceConn) + .withContext(str::stream() << "Failed to authenticate to " << _syncSource); + } + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + // execute remote request + std::string remoteFileName = _remoteFiles[fileIdx].name; + size_t remoteFileSize = _remoteFiles[fileIdx].size; + auto currentBackupFileCloner = + std::make_unique(_backupId, + remoteFileName, + remoteFileSize, + getPathRelativeTo(remoteFileName, _remoteDBPath), + _sharedData.get(), + _syncSource, + &syncSourceConn, + _storage, + _writerPool); + auto cloneStatus = currentBackupFileCloner->run(); + if (!cloneStatus.isOK()) { + LOGV2_WARNING(128412, + "Failed to clone file", + "fileName"_attr = remoteFileName, + "error"_attr = cloneStatus); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, cloneStatus); + } else { + LOGV2_DEBUG(128413, 1, "Cloned file", "fileName"_attr = remoteFileName); + auto nextFileIdx = fileIdx + 1; + if (nextFileIdx < _remoteFiles.size()) { + // schedule next file cloning + auto status = _scheduleWorkAndSaveHandle_inlock( + [this, nextFileIdx, onCompletionGuard]( + const executor::TaskExecutor::CallbackArgs& args) { + _transferFileCallback(args, nextFileIdx, onCompletionGuard); + }, + &_transferFileHandle, + str::stream() << "_transferFileCallback-" << nextFileIdx); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + } else { + // TODO: all files are cloned - close backup cursor and schedule next step + // TODO: this is temporary cancelation of initial sync for debugging reasons + onCompletionGuard->setResultAndCancelRemainingWork_inlock( + lock, + {ErrorCodes::NotImplemented, + "All files cloned; cancel FCBIS for debugging reason"}); + } + } } catch (const DBException&) { // Report exception as an initial syncer failure. stdx::unique_lock lock(_mutex); diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h index 0c62aae4d7bac..9525d4c8d118b 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.h +++ b/src/mongo/db/repl/initial_syncer_fcb.h @@ -32,6 +32,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #pragma once +#include #include #include #include @@ -42,6 +43,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. 
#include "mongo/base/string_data.h" #include "mongo/bson/bsonobj.h" #include "mongo/bson/bsonobjbuilder.h" +#include "mongo/client/dbclient_connection.h" #include "mongo/client/fetcher.h" #include "mongo/db/operation_context.h" #include "mongo/db/repl/callback_completion_guard.h" @@ -61,6 +63,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/util/duration.h" #include "mongo/util/net/hostandport.h" #include "mongo/util/time_support.h" +#include "mongo/util/uuid.h" namespace mongo { namespace repl { @@ -178,6 +181,14 @@ class InitialSyncerFCB : public InitialSyncerInterface { void setAllowedOutageDuration_forTest(Milliseconds allowedOutageDuration); private: + /** + * Attributes of remote file received from $backupCursor + */ + struct BackupFile { + std::string name; + size_t size; + }; + /** * Returns true if we are still processing initial sync tasks (_state is either Running or * Shutdown). @@ -320,6 +331,13 @@ class InitialSyncerFCB : public InitialSyncerInterface { void _fetchBackupCursorCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs, std::shared_ptr onCompletionGuard) noexcept; + /** + * Callback to transfer file from the sync source + */ + void _transferFileCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs, + std::size_t fileIdx, + std::shared_ptr onCompletionGuard) noexcept; + /** * This function does the following: * 1.) Truncate oplog. @@ -419,6 +437,7 @@ class InitialSyncerFCB : public InitialSyncerInterface { /** * Temporary location to declare all FCB-related private methods + * TODO: reorganize */ Status _moveFiles(const std::vector& files, const std::string& sourceDir, @@ -458,6 +477,9 @@ class InitialSyncerFCB : public InitialSyncerInterface { ThreadPool* _writerPool; // (R) StorageInterface* _storage; // (R) ReplicationProcess* _replicationProcess; // (S) + std::vector _remoteFiles; // TODO: + UUID _backupId; // TODO: + std::string _remoteDBPath; // TODO: // This is invoked with the final status of the initial sync. If startup() fails, this callback // is never invoked. The caller gets the last applied optime when the initial sync completes @@ -475,9 +497,15 @@ class InitialSyncerFCB : public InitialSyncerInterface { // Handle to currently scheduled _fetchBackupCursorCallback() task. executor::TaskExecutor::CallbackHandle _fetchBackupCursorHandle; // (M) + // Handle to currently scheduled _transferFileCallback() task. + executor::TaskExecutor::CallbackHandle _transferFileHandle; // (M) + // RollbackChecker to get rollback ID before and after each initial sync attempt. std::unique_ptr _rollbackChecker; // (M) + // Handle returned from RollbackChecker::reset(). + RollbackChecker::CallbackHandle _getBaseRollbackIdHandle; // (M) + // Handle returned from RollbackChecker::checkForRollback(). 
RollbackChecker::CallbackHandle _getLastRollbackIdHandle; // (M) From 01603b1069010f83dc70e1da2f5f6492a7e3aac2 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Tue, 11 Jun 2024 22:56:19 +0100 Subject: [PATCH 09/32] PSMDB-1284 Implement switching storage locations and files moving --- src/mongo/db/repl/initial_syncer_fcb.cpp | 319 +++++++++++++++++------ src/mongo/db/repl/initial_syncer_fcb.h | 35 ++- 2 files changed, 271 insertions(+), 83 deletions(-) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index 5feaeec4bee33..398073913eb2d 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -38,6 +38,8 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include #include +#include "boost/filesystem/file_status.hpp" +#include "boost/filesystem/operations.hpp" #include #include #include @@ -52,11 +54,13 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/client/fetcher.h" #include "mongo/client/remote_command_retry_scheduler.h" #include "mongo/db/client.h" +#include "mongo/db/concurrency/d_concurrency.h" #include "mongo/db/database_name.h" #include "mongo/db/dbdirectclient.h" #include "mongo/db/feature_compatibility_version_parser.h" #include "mongo/db/index_builds_coordinator.h" #include "mongo/db/namespace_string.h" +#include "mongo/db/operation_context.h" #include "mongo/db/pipeline/aggregate_command_gen.h" #include "mongo/db/repl/all_database_cloner.h" #include "mongo/db/repl/fcb_file_cloner.h" @@ -207,6 +211,7 @@ InitialSyncerFCB::InitialSyncerFCB( _storage(storage), _replicationProcess(replicationProcess), _backupId(UUID::fromCDR(std::array{})), + _cfgDBPath(storageGlobalParams.dbpath), _onCompletion(onCompletion), _createClientFn( [] { return std::make_unique(true /* autoReconnect */); }) { @@ -324,6 +329,7 @@ void InitialSyncerFCB::_cancelRemainingWork_inlock() { _cancelHandle_inlock(_getBaseRollbackIdHandle); _cancelHandle_inlock(_fetchBackupCursorHandle); _cancelHandle_inlock(_transferFileHandle); + _cancelHandle_inlock(_currentHandle); _cancelHandle_inlock(_getLastRollbackIdHandle); if (_sharedData) { @@ -712,15 +718,6 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( return; } - LOGV2_DEBUG(128404, 2, "Reading the list of local files via $backupCursor"); - auto bfiles = _getBackupFiles(); - if (!bfiles.isOK()) { - LOGV2_DEBUG( - 128405, 2, "Failed to get the list of local files", "status"_attr = bfiles.getStatus()); - } - LOGV2_DEBUG( - 128406, 2, "Retrieved names of local files", "number"_attr = bfiles.getValue().size()); - _syncSource = syncSource.getValue(); // Schedule rollback ID checker. @@ -1268,22 +1265,6 @@ namespace { constexpr int kBackupCursorFileFetcherRetryAttempts = 10; -void moveFile(const std::string& src, const std::string& dst) { - LOGV2_DEBUG(128401, 1, "Moving file", "from"_attr = src, "to"_attr = dst); - - uassert(128402, - "Destination file '{}' already exists"_format(dst), - !boost::filesystem::exists(dst)); - - // Boost filesystem functions clear "ec" on success. 
- boost::system::error_code ec; - boost::filesystem::rename(src, dst, ec); - if (ec) { - uasserted(128403, - "Error copying file from '{}' to '{}': {}"_format(src, dst, ec.message())); - } -} - BSONObj makeBackupCursorCmd() { BSONArrayBuilder pipelineBuilder; pipelineBuilder << BSON("$backupCursor" << BSONObj()); @@ -1297,18 +1278,58 @@ AggregateCommandRequest makeBackupCursorRequest() { } // namespace -// function to move list of files from one directory to another -Status InitialSyncerFCB::_moveFiles(const std::vector& files, - const std::string& sourceDir, - const std::string& destDir) { - for (const auto& file : files) { - auto sourcePath = sourceDir + "/" + file; - auto destPath = destDir + "/" + file; - moveFile(sourcePath, destPath); +// clean local files in the dbpath +Status InitialSyncerFCB::_deleteLocalFiles() { + // list of files is in the _localFiles vector of std::string + for (const auto& path : _localFiles) { + boost::system::error_code ec; + boost::filesystem::remove(path, ec); + if (ec) { + return {ErrorCodes::InternalError, + "Error deleting file '{}': {}"_format(path, ec.message())}; + } } return Status::OK(); } +// function to move files from one directory to another +// excluding .dummy subdirectory +Status InitialSyncerFCB::_moveFiles(const boost::filesystem::path& sourceDir, + const boost::filesystem::path& destDir) { + namespace fs = boost::filesystem; + + const fs::path excluded{".dummy"}; + try { + std::vector files; + // populate files list and create directory structure under destDir + for (auto it = fs::recursive_directory_iterator(sourceDir); + it != fs::recursive_directory_iterator(); + ++it) { + if (fs::is_regular_file(it->status())) { + // TODO: filter some files + // push into the list + files.push_back(it->path()); + } else if (fs::is_directory(it->status())) { + auto relPath = fs::relative(it->path(), sourceDir); + if (excluded == relPath) { + it.disable_recursion_pending(); + } else { + fs::create_directories(destDir / relPath); + } + } + } + // move files from the list + for (const auto& sourcePath : files) { + auto destPath = destDir / fs::relative(sourcePath, sourceDir); + fs::rename(sourcePath, destPath); + } + + return Status::OK(); + } catch (const fs::filesystem_error& e) { + return Status(ErrorCodes::UnknownError, e.what()); + } +} + // Open a local backup cursor and obtain a list of files from that. 
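 // The aggregation issued via DBDirectClient below is equivalent to running
 // (illustrative shape; built by makeBackupCursorCmd()/makeBackupCursorRequest()):
 //   db.adminCommand({aggregate: 1, pipeline: [{$backupCursor: {}}], cursor: {}})
 // The first document of the first batch is backup metadata; each subsequent
 // document names one file in the local dbpath.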
StatusWith> InitialSyncerFCB::_getBackupFiles() { std::vector files; @@ -1328,37 +1349,8 @@ StatusWith> InitialSyncerFCB::_getBackupFiles() { } while (cursor->more()) { auto rec = cursor->next(); - logd("isoldbg: {}", rec.toString()); files.emplace_back(rec[kFileNameFieldName].String()); } - - // BSONObj result; - // if (client.runCommand(DatabaseName::kAdmin, makeBackupCursorCmd(), result)) { - // logd("isoldbg: $backupCursor result: {}", result.toString()); - // } else { - // logd("isoldbg: runCommand failed: {}", result.toString()); - // return Status{ErrorCodes::InternalError, "Local $backupCursor failed"}; - // } - - // Use fetcher to run aggregation on sync source - // Fetcher fetcher(_attemptExec.get(), - // host, - // aggRequest.getNamespace().db().toString(), - // aggregation_request_helper::serializeToCommandObj(aggRequest), - // fetcherCallback, - // readPrefMetadata, - // requestTimeout, /* command network timeout */ - // requestTimeout /* getMore network timeout */); - - // Status scheduleStatus = fetcher.schedule(); - // if (!scheduleStatus.isOK()) { - // return scheduleStatus; - // } - - // Status joinStatus = fetcher.join(opCtx); - // if (!joinStatus.isOK()) { - // return joinStatus; - // } } catch (const DBException& e) { return e.toStatus(); } @@ -1366,16 +1358,24 @@ StatusWith> InitialSyncerFCB::_getBackupFiles() { } // Switch storage location -Status InitialSyncerFCB::_switchStorageLocation(const std::string& newLocation) { - auto opCtx = makeOpCtx(); +Status InitialSyncerFCB::_switchStorageLocation(OperationContext* opCtx, + const std::string& newLocation) { + boost::system::error_code ec; + boost::filesystem::create_directories(newLocation, ec); + if (ec) { + return {ErrorCodes::InternalError, + str::stream() << "Failed to create directory " << newLocation + << " Error: " << ec.message()}; + } auto lastShutdownState = - reinitializeStorageEngine(opCtx.get(), StorageEngineInitFlags{}, [&newLocation] { + reinitializeStorageEngine(opCtx, StorageEngineInitFlags{}, [&newLocation] { storageGlobalParams.dbpath = newLocation; }); if (StorageEngine::LastShutdownState::kClean != lastShutdownState) { return {ErrorCodes::InternalError, str::stream() << "Failed to switch storage location to " << newLocation}; } + LOGV2_DEBUG(128415, 1, "Switched storage location", "newLocation"_attr = newLocation); return Status::OK(); } @@ -1575,12 +1575,18 @@ void InitialSyncerFCB::_transferFileCallback( return; } } else { - // TODO: all files are cloned - close backup cursor and schedule next step - // TODO: this is temporary cancelation of initial sync for debugging reasons - onCompletionGuard->setResultAndCancelRemainingWork_inlock( - lock, - {ErrorCodes::NotImplemented, - "All files cloned; cancel FCBIS for debugging reason"}); + // TODO: all files are cloned - close backup cursor + // schedule next task + auto status = _scheduleWorkAndSaveHandle_inlock( + [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) { + _switchToDownloadedCallback(args, onCompletionGuard); + }, + &_currentHandle, + "_switchToDownloadedCallback"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } } } } catch (const DBException&) { @@ -1589,23 +1595,182 @@ void InitialSyncerFCB::_transferFileCallback( onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus()); } -void InitialSyncerFCB::_fcbisDraft() { +void InitialSyncerFCB::_switchToDownloadedCallback( + const executor::TaskExecutor::CallbackArgs& 
callbackArgs, + // NOLINTNEXTLINE(*-unnecessary-value-param) + std::shared_ptr onCompletionGuard) noexcept try { + stdx::lock_guard lock(_mutex); + auto status = _checkForShutdownAndConvertStatus_inlock(callbackArgs, + "_switchToDownloadedCallback cancelled"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + // Save list of files existing in dbpath. We will delete them later + LOGV2_DEBUG(128404, 2, "Reading the list of local files via $backupCursor"); + auto bfiles = _getBackupFiles(); + if (!bfiles.isOK()) { + LOGV2_DEBUG( + 128405, 2, "Failed to get the list of local files", "status"_attr = bfiles.getStatus()); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, bfiles.getStatus()); + return; + } + LOGV2_DEBUG( + 128406, 2, "Retrieved names of local files", "number"_attr = bfiles.getValue().size()); + _localFiles = bfiles.getValue(); + + auto opCtx = makeOpCtx(); + Lock::GlobalLock lk(opCtx.get(), MODE_X); // Switch storage to be pointing to the set of downloaded files - _switchStorageLocation(storageGlobalParams.dbpath + ".initialsync"); + status = _switchStorageLocation(opCtx.get(), _cfgDBPath + "/.initialsync"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } // do some cleanup // TODO: + + // schedule next task + status = _scheduleWorkAndSaveHandle_inlock( + [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) { + _switchToDummyCallback(args, onCompletionGuard); + }, + &_currentHandle, + "_switchToDummyCallback"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} catch (const DBException&) { + // Report exception as an initial syncer failure. 
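+    // (For orientation, the storage-switch sequence implemented by these
+    // callbacks is, summarizing the code rather than adding behavior:
+    //   1. _switchToDownloadedCallback: dbpath -> <dbpath>/.initialsync
+    //   2. _switchToDummyCallback:      -> <dbpath>/.initialsync/.dummy, delete
+    //      the old files, move the downloaded set into <dbpath>
+    //   3. _switchToDBPathCallback:     -> <dbpath>, then finish the attempt.)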
+    stdx::unique_lock lock(_mutex);
+    onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus());
+}
+
+void InitialSyncerFCB::_switchToDummyCallback(
+    const executor::TaskExecutor::CallbackArgs& callbackArgs,
+    // NOLINTNEXTLINE(*-unnecessary-value-param)
+    std::shared_ptr onCompletionGuard) noexcept try {
+    stdx::lock_guard lock(_mutex);
+    auto status =
+        _checkForShutdownAndConvertStatus_inlock(callbackArgs, "_switchToDummyCallback cancelled");
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+
+    auto opCtx = makeOpCtx();
+    Lock::GlobalLock lk(opCtx.get(), MODE_X);
     // Switch storage to a dummy location
-    _switchStorageLocation(storageGlobalParams.dbpath + ".dummy");
+    status = _switchStorageLocation(opCtx.get(), _cfgDBPath + "/.initialsync/.dummy");
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+
     // Delete the list of files obtained from the local backup cursor
-    // TODO:
+    status = _deleteLocalFiles();
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+
     // Move the files from the download location to the normal dbpath
-    //_moveFiles(files, storageGlobalParams.dbpath + ".initialsync", storageGlobalParams.dbpath);
+    boost::filesystem::path cfgDBPath(_cfgDBPath);
+    status = _moveFiles(cfgDBPath / ".initialsync", cfgDBPath);
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+
+    // schedule next task
+    status = _scheduleWorkAndSaveHandle_inlock(
+        [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) {
+            _switchToDBPathCallback(args, onCompletionGuard);
+        },
+        &_currentHandle,
+        "_switchToDBPathCallback");
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+} catch (const DBException&) {
+    // Report exception as an initial syncer failure.
+    stdx::unique_lock lock(_mutex);
+    onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus());
+}
+
+void InitialSyncerFCB::_switchToDBPathCallback(
+    const executor::TaskExecutor::CallbackArgs& callbackArgs,
+    // NOLINTNEXTLINE(*-unnecessary-value-param)
+    std::shared_ptr onCompletionGuard) noexcept try {
+    stdx::lock_guard lock(_mutex);
+    auto status =
+        _checkForShutdownAndConvertStatus_inlock(callbackArgs, "_switchToDBPathCallback cancelled");
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+
+    // TODO: should it be the same lock from the previous stage?
+    auto opCtx = makeOpCtx();
+    Lock::GlobalLock lk(opCtx.get(), MODE_X);
     // Switch storage back to the normal dbpath
-    _switchStorageLocation(storageGlobalParams.dbpath);
+    status = _switchStorageLocation(opCtx.get(), _cfgDBPath);
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+    // TODO: release global lock here (before reconstructing prepared transactions etc)
+
+    // Reconstruct prepared transactions and other ephemera
+    // TODO:
+
+    // TODO: set value of _lastApplied or provide another instance of OpTimeAndWallTime
+    // Successfully complete initial sync
+    onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, _lastApplied);
+} catch (const DBException&) {
+    // Report exception as an initial syncer failure.
+    stdx::unique_lock lock(_mutex);
+    onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus());
+}
+
+// template
+// void InitialSyncerFCB::_(const executor::TaskExecutor::CallbackArgs& callbackArgs,
+//                          // NOLINTNEXTLINE(*-unnecessary-value-param)
+//                          std::shared_ptr onCompletionGuard) noexcept try {
+//    stdx::lock_guard lock(_mutex);
+//    auto status = _checkForShutdownAndConvertStatus_inlock(callbackArgs, "error message");
+//    if (!status.isOK()) {
+//        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+//        return;
+//    }
+//
+//    // schedule next task
+//    status = _scheduleWorkAndSaveHandle_inlock(
+//        [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) {
+//            _nextTaskCallback(args, onCompletionGuard);
+//        },
+//        &_currentHandle,
+//        "task name");
+//    if (!status.isOK()) {
+//        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+//        return;
+//    }
+//} catch (const DBException&) {
+//    // Report exception as an initial syncer failure.
+//    stdx::unique_lock lock(_mutex);
+//    onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus());
+//}
+
+// debugging template
+//            onCompletionGuard->setResultAndCancelRemainingWork_inlock(
+//                lock,
+//                {ErrorCodes::NotImplemented,
+//                 "All files cloned; cancel FCBIS for debugging reason"});
+
+
 std::string InitialSyncerFCB::Stats::toString() const {
     return toBSON().toString();
 }
diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h
index 9525d4c8d118b..1cadf85ca17b7 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.h
+++ b/src/mongo/db/repl/initial_syncer_fcb.h
@@ -338,6 +338,24 @@ class InitialSyncerFCB : public InitialSyncerInterface {
                                std::size_t fileIdx,
                                std::shared_ptr onCompletionGuard) noexcept;
 
+    /**
+     * Switch to downloaded files and do some cleanup of the 'local' db
+     */
+    void _switchToDownloadedCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs,
+                                     std::shared_ptr onCompletionGuard) noexcept;
+
+    /**
+     * Switch to dummy location, remove local files from dbpath, move downloaded files to the dbpath
+     */
+    void _switchToDummyCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs,
+                                std::shared_ptr onCompletionGuard) noexcept;
+
+    /**
+     * Switch back to dbpath, finalize and complete initial sync
+     */
+    void _switchToDBPathCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs,
+                                 std::shared_ptr onCompletionGuard) noexcept;
+
     /**
      * This function does the following:
      * 1.) Truncate oplog.
@@ -439,15 +457,14 @@ class InitialSyncerFCB : public InitialSyncerInterface {
      * Temporary location to declare all FCB-related private methods
      * TODO: reorganize
      */
-    Status _moveFiles(const std::vector& files,
-                      const std::string& sourceDir,
-                      const std::string& destDir);
+    Status _deleteLocalFiles();
 
-    StatusWith> _getBackupFiles();
+    Status _moveFiles(const boost::filesystem::path& sourceDir,
+                      const boost::filesystem::path& destDir);
 
-    Status _switchStorageLocation(const std::string& newLocation);
+    StatusWith> _getBackupFiles();
 
-    void _fcbisDraft();
+    Status _switchStorageLocation(OperationContext* opCtx, const std::string& newLocation);
 
     // Counts how many documents have been refetched from the source in the current batch.
AtomicWord _fetchCount; @@ -477,9 +494,11 @@ class InitialSyncerFCB : public InitialSyncerInterface { ThreadPool* _writerPool; // (R) StorageInterface* _storage; // (R) ReplicationProcess* _replicationProcess; // (S) + std::vector _localFiles; // TODO: std::vector _remoteFiles; // TODO: UUID _backupId; // TODO: std::string _remoteDBPath; // TODO: + const std::string _cfgDBPath; // TODO: // This is invoked with the final status of the initial sync. If startup() fails, this callback // is never invoked. The caller gets the last applied optime when the initial sync completes @@ -500,6 +519,10 @@ class InitialSyncerFCB : public InitialSyncerInterface { // Handle to currently scheduled _transferFileCallback() task. executor::TaskExecutor::CallbackHandle _transferFileHandle; // (M) + // Handle to currently scheduled task (one of several tasks in the file move/dbpath change + // sequence). + executor::TaskExecutor::CallbackHandle _currentHandle; // (M) + // RollbackChecker to get rollback ID before and after each initial sync attempt. std::unique_ptr _rollbackChecker; // (M) From 613b0dbab271c7e77cff61af7f56fa4710210f72 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Mon, 17 Jun 2024 19:16:21 +0100 Subject: [PATCH 10/32] PSMDB-1284 cleanup 'local' db --- src/mongo/db/repl/SConscript | 2 ++ src/mongo/db/repl/initial_syncer_fcb.cpp | 43 ++++++++++++++++++++++-- src/mongo/db/repl/initial_syncer_fcb.h | 3 ++ 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index ccfa0f6c5d55f..062d66e6ed792 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1276,10 +1276,12 @@ env.Library( 'tenant_migration_access_blocker', ], LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/catalog/catalog_control', '$BUILD_DIR/mongo/db/index_builds_coordinator_interface', '$BUILD_DIR/mongo/db/server_base', '$BUILD_DIR/mongo/db/serverless/serverless_lock', '$BUILD_DIR/mongo/db/session/session_catalog_mongod', + '$BUILD_DIR/mongo/db/startup_recovery', '$BUILD_DIR/mongo/db/storage/storage_engine_common', '$BUILD_DIR/mongo/executor/scoped_task_executor', '$BUILD_DIR/mongo/util/progress_meter', diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index 398073913eb2d..0dcc18d24d0c6 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -31,6 +31,8 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "initial_syncer_fcb.h" +#include +#include #include #include #include @@ -38,8 +40,6 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include #include -#include "boost/filesystem/file_status.hpp" -#include "boost/filesystem/operations.hpp" #include #include #include @@ -53,6 +53,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/client/dbclient_cursor.h" #include "mongo/client/fetcher.h" #include "mongo/client/remote_command_retry_scheduler.h" +#include "mongo/db/catalog/catalog_control.h" #include "mongo/db/client.h" #include "mongo/db/concurrency/d_concurrency.h" #include "mongo/db/database_name.h" @@ -72,6 +73,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. 
#include "mongo/db/repl/repl_server_parameters_gen.h" #include "mongo/db/repl/replication_auth.h" #include "mongo/db/repl/replication_consistency_markers.h" +#include "mongo/db/repl/replication_coordinator.h" #include "mongo/db/repl/replication_process.h" #include "mongo/db/repl/storage_interface.h" #include "mongo/db/repl/sync_source_selector.h" @@ -80,6 +82,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/server_options.h" #include "mongo/db/serverless/serverless_operation_lock_registry.h" #include "mongo/db/service_context.h" +#include "mongo/db/startup_recovery.h" #include "mongo/db/storage/storage_engine.h" #include "mongo/db/storage/storage_engine_init.h" #include "mongo/db/storage/storage_options.h" @@ -1367,6 +1370,9 @@ Status InitialSyncerFCB::_switchStorageLocation(OperationContext* opCtx, str::stream() << "Failed to create directory " << newLocation << " Error: " << ec.message()}; } + + auto previousCatalogState = catalog::closeCatalog(opCtx); + auto lastShutdownState = reinitializeStorageEngine(opCtx, StorageEngineInitFlags{}, [&newLocation] { storageGlobalParams.dbpath = newLocation; @@ -1375,6 +1381,17 @@ Status InitialSyncerFCB::_switchStorageLocation(OperationContext* opCtx, return {ErrorCodes::InternalError, str::stream() << "Failed to switch storage location to " << newLocation}; } + + + try { + startup_recovery::repairAndRecoverDatabases(opCtx, lastShutdownState); + } catch (const ExceptionFor& error) { + // versions incompatibility (we actually should check this when we select sync source) + return error.toStatus(); + } + + catalog::openCatalogAfterStorageChange(opCtx); + LOGV2_DEBUG(128415, 1, "Switched storage location", "newLocation"_attr = newLocation); return Status::OK(); } @@ -1419,6 +1436,9 @@ void InitialSyncerFCB::_fetchBackupCursorCallback( auto checkpointTimestamp = metadata["checkpointTimestamp"].timestamp(); _backupId = UUID(uassertStatusOK(UUID::parse(metadata[kBackupIdFieldName]))); _remoteDBPath = metadata[kDBPathFieldName].String(); + auto status = OpTime::parseFromOplogEntry(metadata["oplogEnd"].Obj()); + invariant(status.isOK()); + _oplogEnd = status.getValue(); LOGV2_INFO(128409, "Opened backup cursor on sync source", @@ -1622,14 +1642,28 @@ void InitialSyncerFCB::_switchToDownloadedCallback( auto opCtx = makeOpCtx(); Lock::GlobalLock lk(opCtx.get(), MODE_X); + // retrieve the current on-disk replica set configuration + auto* rs = repl::ReplicationCoordinator::get(opCtx->getServiceContext()); + invariant(rs); + BSONObj savedRSConfig = rs->getConfigBSON(); + // Switch storage to be pointing to the set of downloaded files status = _switchStorageLocation(opCtx.get(), _cfgDBPath + "/.initialsync"); if (!status.isOK()) { onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); return; } + // do some cleanup - // TODO: + auto* consistencyMarkers = _replicationProcess->getConsistencyMarkers(); + // TODO: when extend backup cursor is implemented use the last opTime retrieved from the sync + // source + consistencyMarkers->setOplogTruncateAfterPoint(opCtx.get(), _oplogEnd.getTimestamp()); + // clear and reset the initalSyncId + consistencyMarkers->clearInitialSyncId(opCtx.get()); + consistencyMarkers->setInitialSyncIdIfNotSet(opCtx.get()); + // TODO: replace the lastVote document with a default one + // TODO: replace the config with savedRSConfig // schedule next task status = _scheduleWorkAndSaveHandle_inlock( @@ -1728,6 +1762,9 @@ void InitialSyncerFCB::_switchToDBPathCallback( // TODO: 
// TODO: set value of _lastApplied or provide another instance of OpTimeAndWallTime + // TODO: fix this temporary solution + _lastApplied.opTime = _oplogEnd; + _lastApplied.wallTime = Date_t::fromMillisSinceEpoch(_oplogEnd.getSecs() * 1000); // Successfully complete initial sync onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, _lastApplied); } catch (const DBException&) { diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h index 1cadf85ca17b7..ab8e7cc75f274 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.h +++ b/src/mongo/db/repl/initial_syncer_fcb.h @@ -38,6 +38,8 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include #include +#include + #include "mongo/base/status.h" #include "mongo/base/status_with.h" #include "mongo/base/string_data.h" @@ -498,6 +500,7 @@ class InitialSyncerFCB : public InitialSyncerInterface { std::vector _remoteFiles; // TODO: UUID _backupId; // TODO: std::string _remoteDBPath; // TODO: + OpTime _oplogEnd; // TODO: const std::string _cfgDBPath; // TODO: // This is invoked with the final status of the initial sync. If startup() fails, this callback From 108fd0f7ac51242e3a066919a57c8d8fdcd153ae Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Tue, 18 Jun 2024 15:52:09 +0100 Subject: [PATCH 11/32] PSMDB-1284 cleanup 'local' db part 2 also split out 'initial_syncer_fcb' library --- src/mongo/db/SConscript | 1 + src/mongo/db/repl/SConscript | 26 +++++++++++++++++--- src/mongo/db/repl/initial_syncer_fcb.cpp | 31 ++++++++++++++++++++---- 3 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript index c0811fe293eb7..016307ee282b2 100644 --- a/src/mongo/db/SConscript +++ b/src/mongo/db/SConscript @@ -2519,6 +2519,7 @@ env.Library( 'query/stats/stats', 'repl/drop_pending_collection_reaper', 'repl/initial_syncer', + 'repl/initial_syncer_fcb', 'repl/repl_coordinator_impl', 'repl/replication_recovery', 'repl/serveronly_repl', diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index 062d66e6ed792..4efd99e29b384 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1251,11 +1251,9 @@ env.Library( env.Library( target='initial_syncer', source=[ - 'fcb_file_cloner.cpp', 'initial_syncer.cpp', 'initial_syncer_common_stats.cpp', 'initial_syncer_factory.cpp', - 'initial_syncer_fcb.cpp', ], LIBDEPS=[ '$BUILD_DIR/mongo/client/clientdriver_network', @@ -1276,17 +1274,37 @@ env.Library( 'tenant_migration_access_blocker', ], LIBDEPS_PRIVATE=[ - '$BUILD_DIR/mongo/db/catalog/catalog_control', '$BUILD_DIR/mongo/db/index_builds_coordinator_interface', '$BUILD_DIR/mongo/db/server_base', '$BUILD_DIR/mongo/db/serverless/serverless_lock', '$BUILD_DIR/mongo/db/session/session_catalog_mongod', + '$BUILD_DIR/mongo/executor/scoped_task_executor', + 'repl_server_parameters', + ], +) + +env.Library( + target='initial_syncer_fcb', + source=[ + 'fcb_file_cloner.cpp', + 'initial_syncer_fcb.cpp', + ], + LIBDEPS=[ + 'initial_sync_cloners', + 'repl_sync_shared_data', + ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/catalog/catalog_control', + '$BUILD_DIR/mongo/db/index_builds_coordinator_interface', + '$BUILD_DIR/mongo/db/serverless/serverless_lock', '$BUILD_DIR/mongo/db/startup_recovery', '$BUILD_DIR/mongo/db/storage/storage_engine_common', '$BUILD_DIR/mongo/executor/scoped_task_executor', '$BUILD_DIR/mongo/util/progress_meter', - 'repl_server_parameters', + 'drop_pending_collection_reaper', + 'initial_syncer', 
'replication_auth', + 'serveronly_repl', ], ) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index 0dcc18d24d0c6..d2e18394a5124 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -64,6 +64,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/operation_context.h" #include "mongo/db/pipeline/aggregate_command_gen.h" #include "mongo/db/repl/all_database_cloner.h" +#include "mongo/db/repl/drop_pending_collection_reaper.h" #include "mongo/db/repl/fcb_file_cloner.h" #include "mongo/db/repl/initial_sync_state.h" #include "mongo/db/repl/initial_syncer_common_stats.h" @@ -74,6 +75,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/repl/replication_auth.h" #include "mongo/db/repl/replication_consistency_markers.h" #include "mongo/db/repl/replication_coordinator.h" +#include "mongo/db/repl/replication_coordinator_external_state_impl.h" #include "mongo/db/repl/replication_process.h" #include "mongo/db/repl/storage_interface.h" #include "mongo/db/repl/sync_source_selector.h" @@ -1346,8 +1348,6 @@ StatusWith> InitialSyncerFCB::_getBackupFiles() { &client, makeBackupCursorRequest(), true /* secondaryOk */, false /* useExhaust */)); if (cursor->more()) { auto metadata = cursor->next(); - // TODO: remove all logd() calls - logd("isoldbg: $backupCursor metadata: {}", metadata.toString()); files.reserve(cursor->objsLeftInBatch()); } while (cursor->more()) { @@ -1501,7 +1501,6 @@ void InitialSyncerFCB::_fetchBackupCursorCallback( _backupCursorFetcher->onCompletion() .thenRunOn(**_attemptExec) .then([this, fetchStatus, onCompletionGuard, &lock] { - logd("Backup cursor fetcher completion callback"); if (!*fetchStatus) { // the callback was never invoked uasserted(128411, "Internal error running cursor callback in command"); @@ -1662,8 +1661,30 @@ void InitialSyncerFCB::_switchToDownloadedCallback( // clear and reset the initalSyncId consistencyMarkers->clearInitialSyncId(opCtx.get()); consistencyMarkers->setInitialSyncIdIfNotSet(opCtx.get()); - // TODO: replace the lastVote document with a default one - // TODO: replace the config with savedRSConfig + + ReplicationCoordinatorExternalStateImpl externalState( + opCtx->getServiceContext(), + DropPendingCollectionReaper::get(opCtx.get()), + StorageInterface::get(opCtx.get()), + ReplicationProcess::get(opCtx.get())); + // replace the lastVote document with a default one + status = StorageInterface::get(opCtx.get()) + ->dropCollection(opCtx.get(), NamespaceString::kLastVoteNamespace); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + status = externalState.createLocalLastVoteCollection(opCtx.get()); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + // replace the config with savedRSConfig + status = externalState.replaceLocalConfigDocument(opCtx.get(), savedRSConfig); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } // schedule next task status = _scheduleWorkAndSaveHandle_inlock( From 501d7b92b2b7727975e335c5ea2210fc4f18ad86 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Tue, 18 Jun 2024 16:16:13 +0100 Subject: [PATCH 12/32] PSMDB-1284 cleanup comments about reconstruct transactions reconstruct prepared transactions happens in _tearDown_inlock --- 
src/mongo/db/repl/initial_syncer_fcb.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index d2e18394a5124..3f13513a920fe 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -1777,10 +1777,6 @@ void InitialSyncerFCB::_switchToDBPathCallback( onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); return; } - // TODO: release global lock here (before reconstructing prepared transactions etc) - - // Reconstruct prepared transactions and other ephemera - // TODO: // TODO: set value of _lastApplied or provide another instance of OpTimeAndWallTime // TODO: fix this temporary solution From ea20d651d5cfd7a2bb30ebb9bafc54e5e3e31712 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Fri, 21 Jun 2024 18:49:19 +0100 Subject: [PATCH 13/32] PSMDB-1284 kill backup cursor on the sync source --- src/mongo/db/repl/initial_syncer_fcb.cpp | 41 ++++++++++++++++++++++-- src/mongo/db/repl/initial_syncer_fcb.h | 4 +++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index 3f13513a920fe..c4e6b5d191e2f 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -92,6 +92,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/executor/task_executor.h" #include "mongo/logv2/log.h" #include "mongo/platform/compiler.h" // IWYU pragma: keep +#include "mongo/rpc/get_status_from_command_result.h" #include "mongo/stdx/mutex.h" #include "mongo/util/assert_util.h" #include "mongo/util/destructor_guard.h" @@ -596,6 +597,7 @@ void InitialSyncerFCB::_startInitialSyncAttemptCallback( _opts.resetOptimes(); _lastApplied = {OpTime(), Date_t()}; _lastFetched = {}; + _backupCursorInfo.reset(); LOGV2_DEBUG( 21167, 2, "Resetting the oldest timestamp before starting this initial sync attempt"); @@ -1396,6 +1398,34 @@ Status InitialSyncerFCB::_switchStorageLocation(OperationContext* opCtx, return Status::OK(); } +Status InitialSyncerFCB::_killBackupCursor_inlock() { + const auto* info = _backupCursorInfo.get(); + invariant(info); + executor::RemoteCommandRequest killCursorsRequest( + _syncSource, + info->nss.db().toString(), + BSON("killCursors" << info->nss.coll().toString() << "cursors" + << BSON_ARRAY(info->cursorId)), + nullptr); + + auto scheduleResult = _exec->scheduleRemoteCommand( + killCursorsRequest, [](const executor::TaskExecutor::RemoteCommandCallbackArgs& args) { + if (!args.response.isOK()) { + LOGV2_WARNING(128416, + "killCursors command task failed", + "error"_attr = redact(args.response.status)); + return; + } + auto status = getStatusFromCommandResult(args.response.data); + if (status.isOK()) { + LOGV2_INFO(128417, "Killed backup cursor"); + } else { + LOGV2_WARNING(128418, "killCursors command failed", "error"_attr = redact(status)); + } + }); + return scheduleResult.getStatus(); +} + // TenantMigrationRecipientService::Instance::_openBackupCursor // ShardMergeRecipientService::Instance::_openBackupCursor void InitialSyncerFCB::_fetchBackupCursorCallback( @@ -1439,6 +1469,8 @@ void InitialSyncerFCB::_fetchBackupCursorCallback( auto status = OpTime::parseFromOplogEntry(metadata["oplogEnd"].Obj()); invariant(status.isOK()); _oplogEnd = status.getValue(); + _backupCursorInfo = std::make_unique( + data.cursorId, data.nss, checkpointTimestamp); LOGV2_INFO(128409, "Opened backup 
cursor on sync source", @@ -1594,9 +1626,14 @@ void InitialSyncerFCB::_transferFileCallback( return; } } else { - // TODO: all files are cloned - close backup cursor + // all files are cloned - close backup cursor + auto status = _killBackupCursor_inlock(); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } // schedule next task - auto status = _scheduleWorkAndSaveHandle_inlock( + status = _scheduleWorkAndSaveHandle_inlock( [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) { _switchToDownloadedCallback(args, onCompletionGuard); }, diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h index ab8e7cc75f274..502e25a1f24e7 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.h +++ b/src/mongo/db/repl/initial_syncer_fcb.h @@ -55,6 +55,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/repl/multiapplier.h" #include "mongo/db/repl/optime.h" #include "mongo/db/repl/rollback_checker.h" +#include "mongo/db/repl/tenant_migration_shared_data.h" #include "mongo/executor/scoped_task_executor.h" #include "mongo/executor/task_executor.h" #include "mongo/platform/atomic_word.h" @@ -468,6 +469,8 @@ class InitialSyncerFCB : public InitialSyncerInterface { Status _switchStorageLocation(OperationContext* opCtx, const std::string& newLocation); + Status _killBackupCursor_inlock(); + // Counts how many documents have been refetched from the source in the current batch. AtomicWord _fetchCount; @@ -502,6 +505,7 @@ class InitialSyncerFCB : public InitialSyncerInterface { std::string _remoteDBPath; // TODO: OpTime _oplogEnd; // TODO: const std::string _cfgDBPath; // TODO: + std::unique_ptr _backupCursorInfo; // TODO: // This is invoked with the final status of the initial sync. If startup() fails, this callback // is never invoked. 
The caller gets the last applied optime when the initial sync completes From 6281d259cbed053cd039d643fcf238eec4be1e37 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Mon, 24 Jun 2024 16:16:07 +0100 Subject: [PATCH 14/32] PSMDB_1284 kill local backup cursor --- src/mongo/db/repl/initial_syncer_fcb.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index c4e6b5d191e2f..e1e8563d34e36 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -1356,6 +1356,9 @@ StatusWith> InitialSyncerFCB::_getBackupFiles() { auto rec = cursor->next(); files.emplace_back(rec[kFileNameFieldName].String()); } + + // Close cursor + cursor->kill(); } catch (const DBException& e) { return e.toStatus(); } From 7228c33c519d797e8faf737dfa31d75cf5107878 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Mon, 1 Jul 2024 16:13:52 +0100 Subject: [PATCH 15/32] PSMDB-1284 make FCBIS pro-build feature --- SConstruct | 10 ++++++++++ src/mongo/SConscript | 1 + src/mongo/config.h.in | 3 +++ src/mongo/db/SConscript | 2 +- src/mongo/db/repl/SConscript | 1 - src/mongo/db/repl/replication_coordinator_impl.cpp | 12 ++++++++++++ 6 files changed, 27 insertions(+), 2 deletions(-) diff --git a/SConstruct b/SConstruct index a18dba89ab79a..2a42630e556cf 100644 --- a/SConstruct +++ b/SConstruct @@ -296,6 +296,12 @@ add_option( nargs=0, ) +add_option( + 'enable-fcbis', + help='Enable file copy-based initial sync', + nargs=0, +) + add_option( 'full-featured', help='Enable all optional features', @@ -2716,6 +2722,10 @@ if has_option('enable-fipsmode') or has_option('full-featured'): env.SetConfigHeaderDefine("PERCONA_FIPSMODE_ENABLED") env['PSMDB_PRO_FEATURES'].append('FIPSMode') +if has_option('enable-fcbis') or has_option('full-featured'): + env.SetConfigHeaderDefine("PERCONA_FCBIS_ENABLED") + env['PSMDB_PRO_FEATURES'].append('FCBIS') + env.Tool('forceincludes') # ---- other build setup ----- diff --git a/src/mongo/SConscript b/src/mongo/SConscript index 830e4f8a973ab..be39aee0e2315 100644 --- a/src/mongo/SConscript +++ b/src/mongo/SConscript @@ -77,6 +77,7 @@ config_header_substs = ( ('@mongo_config_wiredtiger_enabled@', 'MONGO_CONFIG_WIREDTIGER_ENABLED'), ('@mongo_config_grpc@', 'MONGO_CONFIG_GRPC'), ('@percona_fipsmode_enabled@', 'PERCONA_FIPSMODE_ENABLED'), + ('@percona_fcbis_enabled@', 'PERCONA_FCBIS_ENABLED'), ) diff --git a/src/mongo/config.h.in b/src/mongo/config.h.in index 05ab976f29b16..52109bb230e1b 100644 --- a/src/mongo/config.h.in +++ b/src/mongo/config.h.in @@ -112,3 +112,6 @@ // FIPSMode enabled @percona_fipsmode_enabled@ + +// FCBIS enabled +@percona_fcbis_enabled@ diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript index 016307ee282b2..9e403f65f09ef 100644 --- a/src/mongo/db/SConscript +++ b/src/mongo/db/SConscript @@ -2519,7 +2519,7 @@ env.Library( 'query/stats/stats', 'repl/drop_pending_collection_reaper', 'repl/initial_syncer', - 'repl/initial_syncer_fcb', + 'repl/initial_syncer_fcb' if has_option('enable-fcbis') or has_option('full-featured') else [], 'repl/repl_coordinator_impl', 'repl/replication_recovery', 'repl/serveronly_repl', diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index 4efd99e29b384..1bbd4d0e07c42 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1290,7 +1290,6 @@ env.Library( 'initial_syncer_fcb.cpp', ], LIBDEPS=[ - 'initial_sync_cloners', 'repl_sync_shared_data', ], LIBDEPS_PRIVATE=[ diff 
--git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 4f9ec888887de..edb94e9c1ab5f 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -786,6 +786,18 @@ void ReplicationCoordinatorImpl::_startInitialSync(
                       "logical initial sync.",
                       "initialSyncMethod"_attr = initialSyncMethod,
                       "error"_attr = swInitialSyncer.getStatus().reason());
+        if (initialSyncMethod == "fileCopyBased") {
+            LOGV2_WARNING(
+                128490,
+                "Support for the file copy-based initial sync (FCBIS) is available in "
+                "the Percona Supported Builds of MongoDB. You can compile Percona "
+                "Server for MongoDB with FCBIS yourself by following the build from "
+                "source guide "
+                "(https://docs.percona.com/percona-server-for-mongodb/7.0/install/"
+                "source.html). You can also subscribe to support to receive Percona "
+                "Supported Builds, see (https://www.percona.com/services/support) for "
+                "more information.");
+        }
         swInitialSyncer = createInitialSyncer(std::string("logical"));
     }
     initialSyncerCopy = uassertStatusOK(swInitialSyncer);

From f77bf4ec27c810407506d6886cad461508d30b96 Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Mon, 29 Jul 2024 17:34:46 +0100
Subject: [PATCH 16/32] PSMDB-1284 make logv2 ids unique

---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 59 ++++++++++++------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index e1e8563d34e36..ee613aecc6779 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -316,13 +316,13 @@ Status InitialSyncerFCB::shutdown() {
 void InitialSyncerFCB::cancelCurrentAttempt() {
     stdx::lock_guard lk(_mutex);
     if (_isActive_inlock()) {
-        LOGV2_DEBUG(4427201,
+        LOGV2_DEBUG(128419,
                     1,
                     "Cancelling the current initial sync attempt.",
                     "currentAttempt"_attr = _stats.failedInitialSyncAttempts + 1);
         _cancelRemainingWork_inlock();
     } else {
-        LOGV2_DEBUG(4427202,
+        LOGV2_DEBUG(128420,
                     1,
                     "There is no initial sync attempt to cancel because the initial syncer is not "
                     "currently active.");
@@ -484,7 +484,7 @@ BSONObj InitialSyncerFCB::_getInitialSyncProgress_inlock() const {
         }
         return bob.obj();
     } catch (const DBException& e) {
-        LOGV2(21161,
+        LOGV2(128421,
              "Error creating initial sync progress object: {error}",
              "Error creating initial sync progress object",
              "error"_attr = e.toString());
@@ -543,7 +543,7 @@ void InitialSyncerFCB::_tearDown_inlock(OperationContext* opCtx,
         invariant(currentLastAppliedOpTime == lastAppliedOpTime);
     }
 
-    LOGV2(21163,
+    LOGV2(128422,
           "initial sync done; took "
           "{duration}.",
           "Initial sync done",
@@ -569,7 +569,7 @@ void InitialSyncerFCB::_startInitialSyncAttemptCallback(
         return;
     }
 
-    LOGV2(21164,
+    LOGV2(128423,
          "Starting initial sync (attempt {initialSyncAttempt} of {initialSyncMaxAttempts})",
          "Starting initial sync attempt",
          "initialSyncAttempt"_attr = (initialSyncAttempt + 1),
@@ -589,18 +589,19 @@ void InitialSyncerFCB::_startInitialSyncAttemptCallback(
     // has to run outside lock.
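    // (The LOGV2 ids rewritten throughout this patch move FCBIS log lines into a
    // dedicated 1284xx range so they no longer collide with the upstream ids they
    // were copied from; that the range derives from ticket PSMDB-1284 is an
    // assumption based on the patch subject.)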
stdx::lock_guard lock(_mutex); - LOGV2_DEBUG( - 21165, 2, "Resetting sync source so a new one can be chosen for this initial sync attempt"); + LOGV2_DEBUG(128424, + 2, + "Resetting sync source so a new one can be chosen for this initial sync attempt"); _syncSource = HostAndPort(); - LOGV2_DEBUG(21166, 2, "Resetting all optimes before starting this initial sync attempt"); + LOGV2_DEBUG(128425, 2, "Resetting all optimes before starting this initial sync attempt"); _opts.resetOptimes(); _lastApplied = {OpTime(), Date_t()}; _lastFetched = {}; _backupCursorInfo.reset(); LOGV2_DEBUG( - 21167, 2, "Resetting the oldest timestamp before starting this initial sync attempt"); + 128426, 2, "Resetting the oldest timestamp before starting this initial sync attempt"); auto* storageEngine = getGlobalServiceContext()->getStorageEngine(); if (storageEngine) { // Set the oldestTimestamp to one because WiredTiger does not allow us to set it to zero @@ -610,7 +611,7 @@ void InitialSyncerFCB::_startInitialSyncAttemptCallback( storageEngine->setOldestTimestamp(kTimestampOne, true /*force*/); } - LOGV2_DEBUG(21168, + LOGV2_DEBUG(128427, 2, "Resetting feature compatibility version to last-lts. If the sync source is in " "latest feature compatibility version, we will find out when we clone the " @@ -642,7 +643,7 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( std::uint32_t chooseSyncSourceMaxAttempts, std::shared_ptr onCompletionGuard) noexcept try { if (MONGO_unlikely(initialSyncHangBeforeChoosingSyncSourceFCB.shouldFail())) { - LOGV2(5284800, "initialSyncHangBeforeChoosingSyncSourceFCB fail point enabled"); + LOGV2(128428, "initialSyncHangBeforeChoosingSyncSourceFCB fail point enabled"); initialSyncHangBeforeChoosingSyncSourceFCB.pauseWhileSet(); } @@ -675,7 +676,7 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( } auto when = (*_attemptExec)->now() + _opts.syncSourceRetryWait; - LOGV2_DEBUG(21169, + LOGV2_DEBUG(128429, 1, "Error getting sync source: '{error}', trying again in " "{syncSourceRetryWait} at {retryTime}. Attempt {chooseSyncSourceAttempt} of " @@ -705,7 +706,7 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( if (MONGO_unlikely(initialSyncHangBeforeCreatingOplogFCB.shouldFail())) { // This log output is used in js tests so please leave it. - LOGV2(21170, + LOGV2(128430, "initial sync - initialSyncHangBeforeCreatingOplogFCB fail point " "enabled. Blocking until fail point is disabled."); lock.unlock(); @@ -747,7 +748,7 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( // TODO: we probably don't need this in FCBIS Status InitialSyncerFCB::_truncateOplogAndDropReplicatedDatabases() { // truncate oplog; drop user databases. - LOGV2_DEBUG(4540700, + LOGV2_DEBUG(128431, 1, "About to truncate the oplog, if it exists, ns:{namespace}, and drop all " "user databases (so that we can clone them).", @@ -764,20 +765,20 @@ Status InitialSyncerFCB::_truncateOplogAndDropReplicatedDatabases() { UnreplicatedWritesBlock unreplicatedWritesBlock(opCtx.get()); // 1.) Truncate the oplog. - LOGV2_DEBUG(4540701, + LOGV2_DEBUG(128432, 2, "Truncating the existing oplog: {namespace}", "Truncating the existing oplog", logAttrs(NamespaceString::kRsOplogNamespace)); Timer timer; auto status = _storage->truncateCollection(opCtx.get(), NamespaceString::kRsOplogNamespace); - LOGV2(21173, + LOGV2(128433, "Initial syncer oplog truncation finished in: {durationMillis}ms", "Initial syncer oplog truncation finished", "durationMillis"_attr = timer.millis()); if (!status.isOK()) { // 1a.) Create the oplog. 
- LOGV2_DEBUG(4540702, + LOGV2_DEBUG(128434, 2, "Creating the oplog: {namespace}", "Creating the oplog", @@ -793,7 +794,7 @@ Status InitialSyncerFCB::_truncateOplogAndDropReplicatedDatabases() { ->abortAllIndexBuildsForInitialSync(opCtx.get(), "Aborting index builds for initial sync"); // 2b.) Drop user databases. - LOGV2_DEBUG(21175, 2, "Dropping user databases"); + LOGV2_DEBUG(128435, 2, "Dropping user databases"); return _storage->dropReplicatedDatabases(opCtx.get()); } @@ -887,7 +888,7 @@ void InitialSyncerFCB::_fcvFetcherCallback(const StatusWithbeginFetchingTimestamp.toBSON()); invariant(!result.getValue().documents.empty()); - LOGV2_DEBUG(4431600, + LOGV2_DEBUG(128437, 2, "Setting begin applying timestamp to {beginApplyingTimestamp}, ns: " "{namespace} and the begin fetching timestamp to {beginFetchingTimestamp}", @@ -942,7 +943,7 @@ void InitialSyncerFCB::_fcvFetcherCallback(const StatusWithscheduleWork( [=](const mongo::executor::TaskExecutor::CallbackArgs&) { _finishCallback(result); }); if (!scheduleResult.isOK()) { - LOGV2_WARNING(21197, + LOGV2_WARNING(128439, "Unable to schedule initial syncer completion task due to " "{error}. Running callback on current thread.", "Unable to schedule initial syncer completion task. Running callback on " @@ -985,7 +986,7 @@ void InitialSyncerFCB::_finishInitialSyncAttempt(const StatusWith lock(_mutex); @@ -1002,7 +1003,7 @@ void InitialSyncerFCB::_finishInitialSyncAttempt(const StatusWith lastApplied if (MONGO_unlikely(initialSyncHangBeforeFinishFCB.shouldFail())) { // This log output is used in js tests so please leave it. - LOGV2(21194, + LOGV2(128444, "initial sync - initialSyncHangBeforeFinishFCB fail point " "enabled. Blocking until fail point is disabled."); while (MONGO_unlikely(initialSyncHangBeforeFinishFCB.shouldFail()) && !_isShuttingDown()) { @@ -1113,7 +1114,7 @@ void InitialSyncerFCB::_finishCallback(StatusWith lastApplied try { onCompletion(lastApplied); } catch (...) { - LOGV2_WARNING(21198, + LOGV2_WARNING(128445, "initial syncer finish callback threw exception: {error}", "Initial syncer finish callback threw exception", "error"_attr = redact(exceptionToStatus())); @@ -1144,7 +1145,7 @@ void InitialSyncerFCB::_finishCallback(StatusWith lastApplied } if (MONGO_unlikely(initialSyncHangAfterFinishFCB.shouldFail())) { - LOGV2(5825800, + LOGV2(128446, "initial sync finished - initialSyncHangAfterFinishFCB fail point " "enabled. 
Blocking until fail point is disabled.");
        while (MONGO_unlikely(initialSyncHangAfterFinishFCB.shouldFail()) && !_isShuttingDown()) {

From 20f6edbfa5ef2c8ca3e6c5764bfbb2d8208868b2 Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Mon, 29 Jul 2024 17:17:31 +0100
Subject: [PATCH 17/32] PSMDB-1477 Do not assert on initial sync attempt failure

---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index ee613aecc6779..c2be1c8ec09ee 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -1541,14 +1541,18 @@ void InitialSyncerFCB::_fetchBackupCursorCallback(
         // the callback was never invoked
         uasserted(128411, "Internal error running cursor callback in command");
     }
-    uassertStatusOK(fetchStatus->get());
+    auto status = fetchStatus->get();
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
 
     uassert(128414,
             "Internal error: no file names collected from sync source",
             !_remoteFiles.empty());
 
     // schedule file transfer callback
-    auto status = _scheduleWorkAndSaveHandle_inlock(
+    status = _scheduleWorkAndSaveHandle_inlock(
         [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) {
             _transferFileCallback(args, 0lu, onCompletionGuard);
         },

From be543b7ce3921d3b736d0df7d557541a3417469a Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Wed, 14 Aug 2024 20:35:33 +0100
Subject: [PATCH 18/32] PSMDB-1477 Ensure fallback to logical by returning InvalidSyncSource

---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index c2be1c8ec09ee..f6d9dbead3be8 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -1511,6 +1511,12 @@ void InitialSyncerFCB::_fetchBackupCursorCallback(
             LOGV2_ERROR(
                 128408, "Error fetching backup cursor entries", "error"_attr = ex.toString());
             *fetchStatus = ex.toStatus();
+            // In case of the following error:
+            // "Location50886: The existing backup cursor must be closed before $backupCursor can
+            // succeed."
replace the error code with InvalidSyncSource to ensure fallback to logical
+            if (fetchStatus->get().code() == 50886) {
+                *fetchStatus = Status{ErrorCodes::InvalidSyncSource, ex.reason()};
+            }
         }
     };

From 5e1c7fdbff48ff0de9a476e5f55d5a47e06c2fb2 Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Wed, 14 Aug 2024 20:36:24 +0100
Subject: [PATCH 19/32] PSMDB-1477 fix method name in file copy-based initial syncer

---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index f6d9dbead3be8..124bfe422689d 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -248,7 +248,7 @@ bool InitialSyncerFCB::_isActive_inlock() const {
 }
 
 std::string InitialSyncerFCB::getInitialSyncMethod() const {
-    return "logical";
+    return "fileCopyBased";
 }
 
 Status InitialSyncerFCB::startup(OperationContext* opCtx,

From 103c78ee08716a325705e6f059a7f96c98f86140 Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Mon, 26 Aug 2024 18:27:34 +0100
Subject: [PATCH 20/32] PSMDB-1479 Do all transformations under single global lock

Fixes the "Fatal message" error that happened in OplogCapMaintainerThread.
Both the switch to the dummy location and the switch back to the dbpath
location must happen under a single global lock.
---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 37 +++++++++++------------
 src/mongo/db/repl/initial_syncer_fcb.h   | 13 +++++----
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index 124bfe422689d..a77146f960d35 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -1740,10 +1740,10 @@ void InitialSyncerFCB::_switchToDownloadedCallback(
     // schedule next task
     status = _scheduleWorkAndSaveHandle_inlock(
         [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) {
-            _switchToDummyCallback(args, onCompletionGuard);
+            _switchToDummyToDBPathCallback(args, onCompletionGuard);
         },
         &_currentHandle,
-        "_switchToDummyCallback");
+        "_switchToDummyToDBPathCallback");
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;
@@ -1754,13 +1754,13 @@ void InitialSyncerFCB::_switchToDownloadedCallback(
     onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus());
 }
 
-void InitialSyncerFCB::_switchToDummyCallback(
+void InitialSyncerFCB::_switchToDummyToDBPathCallback(
     const executor::TaskExecutor::CallbackArgs& callbackArgs,
     // NOLINTNEXTLINE(*-unnecessary-value-param)
     std::shared_ptr onCompletionGuard) noexcept try {
     stdx::lock_guard lock(_mutex);
-    auto status =
-        _checkForShutdownAndConvertStatus_inlock(callbackArgs, "_switchToDummyCallback cancelled");
+    auto status = _checkForShutdownAndConvertStatus_inlock(
+        callbackArgs, "_switchToDummyToDBPathCallback cancelled");
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;
@@ -1790,13 +1790,20 @@ void InitialSyncerFCB::_switchToDummyCallback(
         return;
     }
 
+    // Switch storage back to the normal dbpath
+    status = _switchStorageLocation(opCtx.get(), _cfgDBPath);
+    if (!status.isOK()) {
+        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
+        return;
+    }
+
     // schedule next task
     status = _scheduleWorkAndSaveHandle_inlock(
         [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) {
-            _switchToDBPathCallback(args,
onCompletionGuard);
+            _finalizeAndCompleteCallback(args, onCompletionGuard);
         },
         &_currentHandle,
-        "_switchToDBPathCallback");
+        "_finalizeAndCompleteCallback");
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;
@@ -1807,23 +1814,13 @@ void InitialSyncerFCB::_switchToDummyCallback(
     onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus());
 }
 
-void InitialSyncerFCB::_switchToDBPathCallback(
+void InitialSyncerFCB::_finalizeAndCompleteCallback(
     const executor::TaskExecutor::CallbackArgs& callbackArgs,
     // NOLINTNEXTLINE(*-unnecessary-value-param)
     std::shared_ptr onCompletionGuard) noexcept try {
     stdx::lock_guard lock(_mutex);
-    auto status =
-        _checkForShutdownAndConvertStatus_inlock(callbackArgs, "_switchToDBPathCallback cancelled");
-    if (!status.isOK()) {
-        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
-        return;
-    }
-
-    // TODO: should it be the same lock from the previious stage?
-    auto opCtx = makeOpCtx();
-    Lock::GlobalLock lk(opCtx.get(), MODE_X);
-    // Switch storage back to the normal dbpath
-    status = _switchStorageLocation(opCtx.get(), _cfgDBPath);
+    auto status = _checkForShutdownAndConvertStatus_inlock(
+        callbackArgs, "_finalizeAndCompleteCallback cancelled");
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;
diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h
index 502e25a1f24e7..9a808f945a3c8 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.h
+++ b/src/mongo/db/repl/initial_syncer_fcb.h
@@ -349,15 +349,18 @@ class InitialSyncerFCB : public InitialSyncerInterface {
 
     /**
      * Switch to dummy location, remove local files from dbpath, move downloaded files to the dbpath
+     * Switch back to dbpath
      */
-    void _switchToDummyCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs,
-                                std::shared_ptr onCompletionGuard) noexcept;
+    void _switchToDummyToDBPathCallback(
+        const executor::TaskExecutor::CallbackArgs& callbackArgs,
+        std::shared_ptr onCompletionGuard) noexcept;
 
     /**
-     * Switch back to dbpath, finalize and complete inital sync
+     * Finalize and complete initial sync
      */
-    void _switchToDBPathCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs,
-                                 std::shared_ptr onCompletionGuard) noexcept;
+    void _finalizeAndCompleteCallback(
+        const executor::TaskExecutor::CallbackArgs& callbackArgs,
+        std::shared_ptr onCompletionGuard) noexcept;
 
     /**
      * This function does the following:

From 2ce857ef1d1d5f45ad70fc07f5c908cb66597aec Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Wed, 18 Sep 2024 18:29:30 +0100
Subject: [PATCH 21/32] PSMDB-1478 use correct recovery mode after changing storage location

---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 27 ++++++++++++++++--------
 src/mongo/db/repl/initial_syncer_fcb.h   |  7 +++++-
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index a77146f960d35..d51d8922d3017 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -1367,8 +1367,10 @@ StatusWith> InitialSyncerFCB::_getBackupFiles() {
 }
 
 // Switch storage location
-Status InitialSyncerFCB::_switchStorageLocation(OperationContext* opCtx,
-                                                const std::string& newLocation) {
+Status InitialSyncerFCB::_switchStorageLocation(
+    OperationContext* opCtx,
+    const std::string& newLocation,
+    const boost::optional recoveryMode) {
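+    // The boost::system::error_code overload of create_directories() is used here,
+    // so a failure to create the new dbpath is reported through `ec` rather than by
+    // throwing, and can be returned to the caller as a Status.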
boost::system::error_code ec; boost::filesystem::create_directories(newLocation, ec); if (ec) { @@ -1389,11 +1391,15 @@ Status InitialSyncerFCB::_switchStorageLocation(OperationContext* opCtx, } - try { - startup_recovery::repairAndRecoverDatabases(opCtx, lastShutdownState); - } catch (const ExceptionFor& error) { - // versions incompatibility (we actually should check this when we select sync source) - return error.toStatus(); + if (recoveryMode) { + // We need to run startup recovery in the specified mode. + // This is necessary to ensure that the storage engine is in a consistent state. + try { + startup_recovery::runStartupRecoveryInMode(opCtx, lastShutdownState, *recoveryMode); + } catch (const ExceptionFor& error) { + // versions incompatibility (we actually should check this when we select sync source) + return error.toStatus(); + } } catalog::openCatalogAfterStorageChange(opCtx); @@ -1698,7 +1704,9 @@ void InitialSyncerFCB::_switchToDownloadedCallback( BSONObj savedRSConfig = rs->getConfigBSON(); // Switch storage to be pointing to the set of downloaded files - status = _switchStorageLocation(opCtx.get(), _cfgDBPath + "/.initialsync"); + status = _switchStorageLocation(opCtx.get(), + _cfgDBPath + "/.initialsync", + startup_recovery::StartupRecoveryMode::kReplicaSetMember); if (!status.isOK()) { onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); return; @@ -1791,7 +1799,8 @@ void InitialSyncerFCB::_switchToDummyToDBPathCallback( } // Switch storage back to the normal dbpath - status = _switchStorageLocation(opCtx.get(), _cfgDBPath); + status = _switchStorageLocation( + opCtx.get(), _cfgDBPath, startup_recovery::StartupRecoveryMode::kReplicaSetMember); if (!status.isOK()) { onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); return; diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h index 9a808f945a3c8..529746da80649 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.h +++ b/src/mongo/db/repl/initial_syncer_fcb.h @@ -39,6 +39,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include #include +#include #include "mongo/base/status.h" #include "mongo/base/status_with.h" @@ -56,6 +57,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. 
#include "mongo/db/repl/optime.h" #include "mongo/db/repl/rollback_checker.h" #include "mongo/db/repl/tenant_migration_shared_data.h" +#include "mongo/db/startup_recovery.h" #include "mongo/executor/scoped_task_executor.h" #include "mongo/executor/task_executor.h" #include "mongo/platform/atomic_word.h" @@ -470,7 +472,10 @@ class InitialSyncerFCB : public InitialSyncerInterface { StatusWith> _getBackupFiles(); - Status _switchStorageLocation(OperationContext* opCtx, const std::string& newLocation); + Status _switchStorageLocation( + OperationContext* opCtx, + const std::string& newLocation, + boost::optional = boost::none); Status _killBackupCursor_inlock(); From 750052df3866bc37142498cd39fea55c0249582a Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Wed, 18 Sep 2024 19:34:17 +0100 Subject: [PATCH 22/32] PSMDB-1478 add missing NotifyStartupComplete after changing storage location --- src/mongo/db/repl/initial_syncer_fcb.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index d51d8922d3017..27836b2e4f75f 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -1385,6 +1385,7 @@ Status InitialSyncerFCB::_switchStorageLocation( reinitializeStorageEngine(opCtx, StorageEngineInitFlags{}, [&newLocation] { storageGlobalParams.dbpath = newLocation; }); + opCtx->getServiceContext()->getStorageEngine()->notifyStartupComplete(); if (StorageEngine::LastShutdownState::kClean != lastShutdownState) { return {ErrorCodes::InternalError, str::stream() << "Failed to switch storage location to " << newLocation}; From 7158ffb0dd2a20e2dc40ec31c7d0d2f8e5bfe603 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Fri, 20 Sep 2024 01:07:54 +0100 Subject: [PATCH 23/32] PSMDB-1478 Execute recovery on first start from backup --- src/mongo/db/repl/initial_syncer_fcb.cpp | 62 +++++++++++++++++++++++- src/mongo/db/repl/initial_syncer_fcb.h | 8 +++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index 27836b2e4f75f..867aaa8df5aa4 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -58,6 +58,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. #include "mongo/db/concurrency/d_concurrency.h" #include "mongo/db/database_name.h" #include "mongo/db/dbdirectclient.h" +#include "mongo/db/dbhelpers.h" #include "mongo/db/feature_compatibility_version_parser.h" #include "mongo/db/index_builds_coordinator.h" #include "mongo/db/namespace_string.h" @@ -82,6 +83,7 @@ Copyright (C) 2024-present Percona and/or its affiliates. All rights reserved. 
#include "mongo/db/repl/tenant_migration_access_blocker_util.h" #include "mongo/db/repl/transaction_oplog_application.h" #include "mongo/db/server_options.h" +#include "mongo/db/server_recovery.h" #include "mongo/db/serverless/serverless_operation_lock_registry.h" #include "mongo/db/service_context.h" #include "mongo/db/startup_recovery.h" @@ -1382,8 +1384,9 @@ Status InitialSyncerFCB::_switchStorageLocation( auto previousCatalogState = catalog::closeCatalog(opCtx); auto lastShutdownState = - reinitializeStorageEngine(opCtx, StorageEngineInitFlags{}, [&newLocation] { + reinitializeStorageEngine(opCtx, StorageEngineInitFlags{}, [&newLocation, opCtx] { storageGlobalParams.dbpath = newLocation; + repl::clearLocalOplogPtr(opCtx->getServiceContext()); }); opCtx->getServiceContext()->getStorageEngine()->notifyStartupComplete(); if (StorageEngine::LastShutdownState::kClean != lastShutdownState) { @@ -1715,6 +1718,8 @@ void InitialSyncerFCB::_switchToDownloadedCallback( // do some cleanup auto* consistencyMarkers = _replicationProcess->getConsistencyMarkers(); + consistencyMarkers->setMinValid(opCtx.get(), + OpTime{kTimestampOne, repl::OpTime::kUninitializedTerm}); // TODO: when extend backup cursor is implemented use the last opTime retrieved from the sync // source consistencyMarkers->setOplogTruncateAfterPoint(opCtx.get(), _oplogEnd.getTimestamp()); @@ -1746,6 +1751,61 @@ void InitialSyncerFCB::_switchToDownloadedCallback( return; } + // schedule next task + status = _scheduleWorkAndSaveHandle_inlock( + [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) { + _executeRecovery(args, onCompletionGuard); + }, + &_currentHandle, + "_executeRecovery"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } +} catch (const DBException&) { + // Report exception as an initial syncer failure. + stdx::unique_lock lock(_mutex); + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, exceptionToStatus()); +} + +void InitialSyncerFCB::_executeRecovery( + const executor::TaskExecutor::CallbackArgs& callbackArgs, + // NOLINTNEXTLINE(*-unnecessary-value-param) + std::shared_ptr onCompletionGuard) noexcept try { + stdx::lock_guard lock(_mutex); + auto status = + _checkForShutdownAndConvertStatus_inlock(callbackArgs, "_executeRecovery cancelled"); + if (!status.isOK()) { + onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); + return; + } + + auto opCtx = makeOpCtx(); + auto* serviceCtx = opCtx->getServiceContext(); + inReplicationRecovery(serviceCtx) = true; + ON_BLOCK_EXIT([serviceCtx] { + inReplicationRecovery(serviceCtx) = false; + }); + + _replicationProcess->getReplicationRecovery()->recoverFromOplogAsStandalone(opCtx.get(), true); + + // Aborts all active, two-phase index builds. 
+    [[maybe_unused]] auto stoppedIndexBuilds =
+        IndexBuildsCoordinator::get(serviceCtx)->stopIndexBuildsForRollback(opCtx.get());
+
+    if (!stoppedIndexBuilds.empty()) {
+        LOGV2_WARNING(128498,
+                      "Aborted active index builds during initial sync recovery",
+                      "numIndexBuilds"_attr = stoppedIndexBuilds.size());
+    }
+
+    // Set stable timestamp
+    if (BSONObj lastEntry;
+        Helpers::getLast(opCtx.get(), NamespaceString::kRsOplogNamespace, lastEntry)) {
+        auto lastTime = repl::OpTimeAndWallTime::parse(lastEntry);
+        _storage->setStableTimestamp(serviceCtx, lastTime.opTime.getTimestamp());
+    }
+
     // schedule next task
     status = _scheduleWorkAndSaveHandle_inlock(
         [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) {
diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h
index 529746da80649..0d2ebdcc015ce 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.h
+++ b/src/mongo/db/repl/initial_syncer_fcb.h
@@ -349,6 +349,14 @@ class InitialSyncerFCB : public InitialSyncerInterface {
     void _switchToDownloadedCallback(const executor::TaskExecutor::CallbackArgs& callbackArgs,
                                      std::shared_ptr onCompletionGuard) noexcept;
 
+    /**
+     * Replay the oplog on the instance recovered from backup
+     * Scheduled from _switchToDownloadedCallback
+     * Schedules _switchToDummyToDBPathCallback
+     */
+    void _executeRecovery(const executor::TaskExecutor::CallbackArgs& callbackArgs,
+                          std::shared_ptr onCompletionGuard) noexcept;
+
     /**
      * Switch to dummy location, remove local files from dbpath, move downloaded files to the dbpath
      * Switch back to dbpath

From cb9ce672333ea83f36292c4e3365e6b1da4aad1d Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Sat, 12 Oct 2024 18:02:10 +0100
Subject: [PATCH 24/32] PSMDB-1522 Prevent storage change deadlock by getInitialSyncProgress

The thread calling getInitialSyncProgress holds the opCtx and locks while trying to acquire the initial syncer's mutex.
This led to a deadlock because the storage-changing thread waited for the destruction of the opCtx while holding the initial syncer's mutex
---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index 867aaa8df5aa4..2070bf1fc0554 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -1679,7 +1679,7 @@ void InitialSyncerFCB::_switchToDownloadedCallback(
     const executor::TaskExecutor::CallbackArgs& callbackArgs,
     // NOLINTNEXTLINE(*-unnecessary-value-param)
     std::shared_ptr onCompletionGuard) noexcept try {
-    stdx::lock_guard lock(_mutex);
+    stdx::unique_lock lock(_mutex);
     auto status =
         _checkForShutdownAndConvertStatus_inlock(callbackArgs,
                                                  "_switchToDownloadedCallback cancelled");
     if (!status.isOK()) {
@@ -1708,9 +1708,11 @@ void InitialSyncerFCB::_switchToDownloadedCallback(
         return;
     }
     BSONObj savedRSConfig = rs->getConfigBSON();
 
     // Switch storage to be pointing to the set of downloaded files
+    lock.unlock();
     status = _switchStorageLocation(opCtx.get(),
                                     _cfgDBPath + "/.initialsync",
                                     startup_recovery::StartupRecoveryMode::kReplicaSetMember);
+    lock.lock();
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;
@@ -1829,7 +1829,7 @@ void InitialSyncerFCB::_switchToDummyToDBPathCallback(
     const executor::TaskExecutor::CallbackArgs& callbackArgs,
     // NOLINTNEXTLINE(*-unnecessary-value-param)
     std::shared_ptr onCompletionGuard) noexcept try {
-    stdx::lock_guard lock(_mutex);
+    stdx::unique_lock lock(_mutex);
     auto status = _checkForShutdownAndConvertStatus_inlock(
         callbackArgs, "_switchToDummyToDBPathCallback cancelled");
     if (!status.isOK()) {
@@ -1838,7 +1840,9 @@ void InitialSyncerFCB::_switchToDummyToDBPathCallback(
     auto opCtx = makeOpCtx();
     Lock::GlobalLock lk(opCtx.get(), MODE_X);
     // Switch storage to a dummy location
+    lock.unlock();
     status = _switchStorageLocation(opCtx.get(), _cfgDBPath + "/.initialsync/.dummy");
+    lock.lock();
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;
@@ -1860,8 +1864,10 @@ void InitialSyncerFCB::_switchToDummyToDBPathCallback(
     }
 
     // Switch storage back to the normal dbpath
+    lock.unlock();
     status = _switchStorageLocation(
         opCtx.get(), _cfgDBPath, startup_recovery::StartupRecoveryMode::kReplicaSetMember);
+    lock.lock();
     if (!status.isOK()) {
         onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
         return;

From a3259f3a6e5871f0e8daeed569bd8ca7d22b7397 Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Sat, 12 Oct 2024 18:11:08 +0100
Subject: [PATCH 25/32] PSMDB-1522 In case of terminal shutdown wait for end of storage change

The initial syncer's shutdown() function is called twice; the first call happens when the shutdown thread has not created an opCtx yet. This allows us to wait for the end of any storage change operation and to ensure that no storage change will be running when shutdown() is called the second time (with a non-NOOP opCtx already created). This is done to avoid a deadlock between the shutdown thread and the storage change.
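The coordination idiom, reduced to a minimal standalone sketch (illustrative names; plain std:: primitives stand in for the server's stdx:: wrappers, and the real change reuses the syncer's existing _mutex):

    #include <condition_variable>
    #include <mutex>

    class StorageChangeCoordinator {
    public:
        // RAII guard held by a task for the whole storage-location change.
        class Guard {
        public:
            explicit Guard(StorageChangeCoordinator& c) : _c(c) {
                std::lock_guard<std::mutex> lk(_c._mutex);
                _c._inChange = true;
            }
            ~Guard() {
                {
                    std::lock_guard<std::mutex> lk(_c._mutex);
                    _c._inChange = false;
                }
                _c._cv.notify_all();
            }
            Guard(const Guard&) = delete;
            Guard& operator=(const Guard&) = delete;

        private:
            StorageChangeCoordinator& _c;
        };

        // Called from the first (opCtx-free) shutdown pass: blocks until no
        // storage change is in flight, so the second pass cannot deadlock.
        void waitUntilNoStorageChange() {
            std::unique_lock<std::mutex> lk(_mutex);
            _cv.wait(lk, [this] { return !_inChange; });
        }

    private:
        std::mutex _mutex;
        std::condition_variable _cv;
        bool _inChange = false;
    };

In the diff below, ChangeStorageGuard plays the role of Guard and the _inStorageChangeCondition.wait() call in shutdown() plays the role of waitUntilNoStorageChange().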
---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 10 +++++++-
 src/mongo/db/repl/initial_syncer_fcb.h   | 31 ++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index 2070bf1fc0554..907d4f7541a1e 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -295,7 +295,7 @@ Status InitialSyncerFCB::startup(OperationContext* opCtx,
 }
 
 Status InitialSyncerFCB::shutdown() {
-    stdx::lock_guard lock(_mutex);
+    stdx::unique_lock lock(_mutex);
     switch (_state) {
         case State::kPreStart:
             // Transition directly from PreStart to Complete if not started yet.
@@ -312,6 +312,12 @@ Status InitialSyncerFCB::shutdown() {
 
     _cancelRemainingWork_inlock();
 
+    // Ensure that storage change will not be blocked by shutdown's opCtx (first call to
+    // InitialSyncerFCB::shutdown comes from ReplicationCoordinatorImpl::enterTerminalShutdown
+    // at the moment when there is no opCtx in the shutdown thread yet).
+    // Wait for any task that changes the storage location to finish, if one is running.
+    _inStorageChangeCondition.wait(lock, [this] { return !_inStorageChange; });
+
     return Status::OK();
 }
 
@@ -1679,6 +1685,7 @@ void InitialSyncerFCB::_switchToDownloadedCallback(
     const executor::TaskExecutor::CallbackArgs& callbackArgs,
     // NOLINTNEXTLINE(*-unnecessary-value-param)
     std::shared_ptr onCompletionGuard) noexcept try {
+    ChangeStorageGuard changeStorageGuard(this);
     stdx::unique_lock lock(_mutex);
     auto status =
         _checkForShutdownAndConvertStatus_inlock(callbackArgs,
                                                  "_switchToDownloadedCallback cancelled");
@@ -1836,6 +1843,7 @@ void InitialSyncerFCB::_switchToDummyToDBPathCallback(
     const executor::TaskExecutor::CallbackArgs& callbackArgs,
     // NOLINTNEXTLINE(*-unnecessary-value-param)
     std::shared_ptr onCompletionGuard) noexcept try {
+    ChangeStorageGuard changeStorageGuard(this);
     stdx::unique_lock lock(_mutex);
     auto status = _checkForShutdownAndConvertStatus_inlock(
         callbackArgs, "_switchToDummyToDBPathCallback cancelled");
diff --git a/src/mongo/db/repl/initial_syncer_fcb.h b/src/mongo/db/repl/initial_syncer_fcb.h
index 0d2ebdcc015ce..44f82dbe3d683 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.h
+++ b/src/mongo/db/repl/initial_syncer_fcb.h
@@ -194,6 +194,33 @@ class InitialSyncerFCB : public InitialSyncerInterface {
         size_t size;
     };
 
+    /**
+     * Guard storage changing functions from being deadlocked by shutdown.
+     */
+    class ChangeStorageGuard {
+    public:
+        ChangeStorageGuard(InitialSyncerFCB* initialSyncer) : _initialSyncer(initialSyncer) {
+            stdx::lock_guard lk(_initialSyncer->_mutex);
+            _initialSyncer->_inStorageChange = true;
+        }
+
+        ~ChangeStorageGuard() {
+            {
+                stdx::lock_guard lk(_initialSyncer->_mutex);
+                _initialSyncer->_inStorageChange = false;
+            }
+            _initialSyncer->_inStorageChangeCondition.notify_all();
+        }
+
+        ChangeStorageGuard(const ChangeStorageGuard&) = delete;
+        ChangeStorageGuard& operator=(const ChangeStorageGuard&) = delete;
+        ChangeStorageGuard(ChangeStorageGuard&&) = delete;
+        ChangeStorageGuard& operator=(ChangeStorageGuard&&) = delete;
+
+    private:
+        InitialSyncerFCB* _initialSyncer;
+    };
+
    /**
     * Returns true if we are still processing initial sync tasks (_state is either Running or
     * Shutdown).
@@ -590,6 +617,10 @@ class InitialSyncerFCB : public InitialSyncerInterface {
 
     // The initial sync attempt has been canceled
     bool _attemptCanceled = false;  // (X)
+
+    // Condition variable to wait for end of storage change
+    stdx::condition_variable _inStorageChangeCondition;  // (M)
+    bool _inStorageChange = false;  // (M)
 };
 
 }  // namespace repl

From f9487f461f42b7aa52a40acc14696675af8b8014 Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Thu, 17 Oct 2024 13:46:34 +0100
Subject: [PATCH 26/32] PSMDB-1542 Fix initial sync method name in statistics log message

---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index 907d4f7541a1e..c87c3cd42a63c 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -426,7 +426,7 @@ BSONObj InitialSyncerFCB::getInitialSyncProgress() const {
 }
 
 void InitialSyncerFCB::_appendInitialSyncProgressMinimal_inlock(BSONObjBuilder* bob) const {
-    bob->append("method", "logical");
+    bob->append("method", getInitialSyncMethod());
     _stats.append(bob);
     if (!_initialSyncState) {
         return;

From 8a836476ad5bb7d154e9efbae214ba5599193c07 Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Fri, 28 Feb 2025 13:50:18 +0000
Subject: [PATCH 27/32] PSMDB-1589 Attach JournalListener to the new instance of storage engine

---
 src/mongo/db/repl/initial_syncer_fcb.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp
index c87c3cd42a63c..5023bd4be3252 100644
--- a/src/mongo/db/repl/initial_syncer_fcb.cpp
+++ b/src/mongo/db/repl/initial_syncer_fcb.cpp
@@ -1910,6 +1910,13 @@ void InitialSyncerFCB::_finalizeAndCompleteCallback(
         return;
     }
 
+    {
+        auto opCtx = makeOpCtx();
+        // Attach JournalListener to the new instance of storage engine
+        auto* journalListener = _dataReplicatorExternalState->getReplicationJournalListener();
+        opCtx->getServiceContext()->getStorageEngine()->setJournalListener(journalListener);
+    }
+
     // TODO: set value of _lastApplied or provide another instance of OpTimeAndWallTime
     // TODO: fix this temporary solution
     _lastApplied.opTime = _oplogEnd;

From fe843ac2b1f13c8233d88d580febb6b8e78fa2b0 Mon Sep 17 00:00:00 2001
From: Igor Solodovnikov
Date: Fri, 28 Feb 2025 16:12:53 +0000
Subject: [PATCH 28/32] PSMDB-1589 FCBIS test to ensure new node may work as primary

---
 jstests/replsets/fcbis-replication.js | 76 +++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 jstests/replsets/fcbis-replication.js

diff --git a/jstests/replsets/fcbis-replication.js b/jstests/replsets/fcbis-replication.js
new file mode 100644
index 0000000000000..5b80ed1a02d46
--- /dev/null
+++ b/jstests/replsets/fcbis-replication.js
@@ -0,0 +1,76 @@
+/**
+ * Tests that a new node added via FCBIS works correctly as primary.
+ *
+ * @tags: [requires_wiredtiger]
+ */
+(function() {
+'use strict';
+
+load("jstests/replsets/rslib.js");  // For reconfig and isConfigCommitted.
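+
+// Helper used by this test: reconfig the replica set to add `conn` as member
+// `nodeId`, then wait until the new config is committed and fully replicated.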
+ +let addNodeConfig = function(rst, nodeId, conn) { + var config = rst.getReplSetConfigFromNode(); + config.version += 1; + config.members.push({_id: nodeId, host: conn.host}); + reconfig(rst, config); + assert.soon(() => isConfigCommitted(rst.getPrimary())); + rst.waitForConfigReplication(rst.getPrimary()); + rst.awaitReplication(); + return config; +}; + +const basenodes = 1; // <= Test will not hang if nodes > 1 + + +var rsname = 'fcbis_replset'; +var rs = new ReplSetTest({ + name: rsname, + nodes: basenodes, + nodeOptions: {verbose: 2}, +}); + +rs.startSet({ }); +rs.initiate(); + +// do fsync before FCBIS +assert.commandWorked(rs.getPrimary().adminCommand({fsync: 1})); +// assert.commandWorked(rs.getSecondary().adminCommand({fsync: 1})); + +// Add a new member that will undergo initial sync +let newNode = rs.add({ + rsConfig: {priority: 10}, + setParameter: { + 'initialSyncMethod': 'fileCopyBased', + //'initialSyncSourceReadPreference': 'primary', + }, + verbose: 2, +}); + +// wait for user input to be able to attach gdb before initial sync +//jsTest.log("--XXXX-- newNode: " + newNode.pid); +//print("Press Enter to continue"); +//let psw = passwordPrompt(); + +addNodeConfig(rs, basenodes + 1, newNode); +rs.waitForState(newNode, ReplSetTest.State.SECONDARY); +rs.waitForAllNewlyAddedRemovals(); + +jsTest.log("--XXXX-- Added new member"); + +// Output serverStatus for reference +jsTest.log("--XXXX-- newNode serverStatus: " + tojson(newNode.adminCommand({'serverStatus': 1, repl: 1}))); + +// Make the new member become primary +assert.commandWorked(newNode.adminCommand({replSetStepUp: 1})); +jsTest.log("--XXXX-- After replSetStepUp"); + +rs.awaitNodesAgreeOnPrimary(undefined, undefined, newNode); +jsTest.log("--XXXX-- All nodes agree on newNode being primary"); + +// BUG: This insert would not return and test would hang because of PSMDB-1589. This only happens when using FCBIS. +assert.commandWorked(rs.getPrimary().getDB('test').getCollection('foo').insert({x: 1})); // <= This will fail! +jsTest.log("--XXXX-- After insert on new member"); + +rs.stopSet(); +})(); + From 070bd5a181868298021fe256bfb158193b43051f Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Thu, 13 Mar 2025 08:18:27 +0000 Subject: [PATCH 29/32] PSMDB-1541 Ensure that backup cursor returns oplogEnd without holes --- .../wiredtiger_backup_cursor_hooks.cpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_backup_cursor_hooks.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_backup_cursor_hooks.cpp index b49774adc632e..c7b289c81d71b 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_backup_cursor_hooks.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_backup_cursor_hooks.cpp @@ -32,9 +32,11 @@ Copyright (C) 2021-present Percona and/or its affiliates. All rights reserved. 
#include "mongo/db/storage/wiredtiger/wiredtiger_backup_cursor_hooks.h" -#include "mongo/db/db_raii.h" +#include "mongo/db/concurrency/lock_manager_defs.h" +#include "mongo/db/concurrency/replication_state_transition_lock_guard.h" #include "mongo/db/dbhelpers.h" #include "mongo/db/operation_context.h" +#include "mongo/db/repl/storage_interface.h" #include "mongo/db/storage/encryption_hooks.h" #include "mongo/logv2/log.h" #include "mongo/logv2/log_options.h" @@ -97,6 +99,8 @@ BackupCursorState WiredTigerBackupCursorHooks::openBackupCursor( "$backupCursor can succeed.", _state != kHotBackup); + repl::ReplicationStateTransitionLockGuard replTransitionLock(opCtx, MODE_IX); + // Replica sets must also return the opTime's of the earliest and latest oplog entry. The // range represented by the oplog start/end values must exist in the backup copy, but are // not expected to be exact. @@ -106,15 +110,12 @@ BackupCursorState WiredTigerBackupCursorHooks::openBackupCursor( // If the oplog exists, capture the last oplog entry before opening the backup cursor. This // value will be checked again after the cursor is established to guarantee it still exists // (and was not truncated before the backup cursor was established. - { - AutoGetCollectionForRead coll(opCtx, NamespaceString::kRsOplogNamespace); - if (coll.getCollection()) { - BSONObj lastEntry; - if (Helpers::getLast(opCtx, NamespaceString::kRsOplogNamespace, lastEntry)) { - auto oplogEntry = fassertNoTrace(50913, repl::OplogEntry::parse(lastEntry)); - oplogEnd = oplogEntry.getOpTime(); - } - } + auto replCoordinator = repl::ReplicationCoordinator::get(opCtx); + // Using UNSAFE version because we have RSTL acquired. + if (replCoordinator->isReplEnabled() && replCoordinator->isInPrimaryOrSecondaryState_UNSAFE()) { + oplogEnd = replCoordinator->getMyLastAppliedOpTime(); + // ensure there are no oplog holes before oplogEnd + repl::StorageInterface::get(opCtx)->waitForAllEarlierOplogWritesToBeVisible(opCtx); } auto* engine = opCtx->getServiceContext()->getStorageEngine(); From 8fa4c2801191efa899c89fbad0acc1a7eb5b4629 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Fri, 28 Mar 2025 20:06:41 +0000 Subject: [PATCH 30/32] PSMDB-1523 denylist sync source if FCBIS attempt has failed --- src/mongo/db/repl/initial_syncer_fcb.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index 5023bd4be3252..8ee919683fcf8 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -1062,6 +1062,13 @@ void InitialSyncerFCB::_finishInitialSyncAttempt(const StatusWithnow() + _opts.syncSourceRetryWait * 2; + _opts.syncSourceSelector->denylistSyncSource(_syncSource, until); + } + _attemptExec = std::make_unique( _exec, Status(ErrorCodes::CallbackCanceled, "Initial Sync Attempt Canceled")); _clonerAttemptExec = std::make_unique( @@ -1469,7 +1476,7 @@ void InitialSyncerFCB::_fetchBackupCursorCallback( return aggRequest.toBSON(BSONObj()); }(); - LOGV2_DEBUG(128407, 1, "Opening backup cursor on sync source"); + LOGV2_DEBUG(128407, 1, "Opening backup cursor on sync source", "syncSource"_attr = _syncSource); auto fetchStatus = std::make_shared>(); const auto fetcherCallback = [this, fetchStatus](const Fetcher::QueryResponseStatus& dataStatus, From a28655bdb00c595803132bc1363c89191cb252e7 Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Tue, 1 Apr 2025 09:53:01 +0100 Subject: [PATCH 31/32] PSMDB-1632 Remove 'FCB' 
suffix from FCBIS fail point names --- src/mongo/db/repl/initial_syncer_fcb.cpp | 68 ++++++++++++------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index da5910a629aea..208bf1b188517 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -112,32 +112,32 @@ namespace mongo { namespace repl { // Failpoint for initial sync -MONGO_FAIL_POINT_DEFINE(failInitialSyncWithBadHostFCB); +extern FailPoint failInitialSyncWithBadHost; // Failpoint which causes the initial sync function to hang before creating shared data and // splitting control flow between the oplog fetcher and the cloners. -MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeSplittingControlFlowFCB); +extern FailPoint initialSyncHangBeforeSplittingControlFlow; // Failpoint which causes the initial sync function to hang before copying databases. -MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeCopyingDatabasesFCB); +extern FailPoint initialSyncHangBeforeCopyingDatabases; // Failpoint which causes the initial sync function to hang before finishing. -MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeFinishFCB); +extern FailPoint initialSyncHangBeforeFinish; // Failpoint which causes the initial sync function to hang before creating the oplog. -MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeCreatingOplogFCB); +extern FailPoint initialSyncHangBeforeCreatingOplog; // Failpoint which skips clearing _initialSyncState after a successful initial sync attempt. -MONGO_FAIL_POINT_DEFINE(skipClearInitialSyncStateFCB); +extern FailPoint skipClearInitialSyncState; // Failpoint which causes the initial sync function to fail and hang before starting a new attempt. -MONGO_FAIL_POINT_DEFINE(failAndHangInitialSyncFCB); +extern FailPoint failAndHangInitialSync; // Failpoint which causes the initial sync function to hang before choosing a sync source. -MONGO_FAIL_POINT_DEFINE(initialSyncHangBeforeChoosingSyncSourceFCB); +extern FailPoint initialSyncHangBeforeChoosingSyncSource; // Failpoint which causes the initial sync function to hang after finishing. 
-MONGO_FAIL_POINT_DEFINE(initialSyncHangAfterFinishFCB); +extern FailPoint initialSyncHangAfterFinish; namespace { using namespace executor; @@ -650,9 +650,9 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( std::uint32_t chooseSyncSourceAttempt, std::uint32_t chooseSyncSourceMaxAttempts, std::shared_ptr onCompletionGuard) noexcept try { - if (MONGO_unlikely(initialSyncHangBeforeChoosingSyncSourceFCB.shouldFail())) { - LOGV2(128428, "initialSyncHangBeforeChoosingSyncSourceFCB fail point enabled"); - initialSyncHangBeforeChoosingSyncSourceFCB.pauseWhileSet(); + if (MONGO_unlikely(initialSyncHangBeforeChoosingSyncSource.shouldFail())) { + LOGV2(128428, "initialSyncHangBeforeChoosingSyncSource fail point enabled"); + initialSyncHangBeforeChoosingSyncSource.pauseWhileSet(); } stdx::unique_lock lock(_mutex); @@ -666,9 +666,9 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( return; } - if (MONGO_unlikely(failInitialSyncWithBadHostFCB.shouldFail())) { + if (MONGO_unlikely(failInitialSyncWithBadHost.shouldFail())) { status = Status(ErrorCodes::InvalidSyncSource, - "initial sync failed - failInitialSyncWithBadHostFCB failpoint is set."); + "initial sync failed - failInitialSyncWithBadHost failpoint is set."); onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); return; } @@ -712,13 +712,13 @@ void InitialSyncerFCB::_chooseSyncSourceCallback( return; } - if (MONGO_unlikely(initialSyncHangBeforeCreatingOplogFCB.shouldFail())) { + if (MONGO_unlikely(initialSyncHangBeforeCreatingOplog.shouldFail())) { // This log output is used in js tests so please leave it. LOGV2(128430, - "initial sync - initialSyncHangBeforeCreatingOplogFCB fail point " + "initial sync - initialSyncHangBeforeCreatingOplog fail point " "enabled. Blocking until fail point is disabled."); lock.unlock(); - while (MONGO_unlikely(initialSyncHangBeforeCreatingOplogFCB.shouldFail()) && + while (MONGO_unlikely(initialSyncHangBeforeCreatingOplog.shouldFail()) && !_isShuttingDown()) { mongo::sleepsecs(1); } @@ -894,12 +894,12 @@ void InitialSyncerFCB::_fcvFetcherCallback(const StatusWith(_sharedData->getTotalTimeUnreachable(sdLock)); } - if (MONGO_unlikely(failAndHangInitialSyncFCB.shouldFail())) { - LOGV2(128441, "failAndHangInitialSyncFCB fail point enabled"); - failAndHangInitialSyncFCB.pauseWhileSet(); - result = Status(ErrorCodes::InternalError, "failAndHangInitialSyncFCB fail point enabled"); + if (MONGO_unlikely(failAndHangInitialSync.shouldFail())) { + LOGV2(128441, "failAndHangInitialSync fail point enabled"); + failAndHangInitialSync.pauseWhileSet(); + result = Status(ErrorCodes::InternalError, "failAndHangInitialSync fail point enabled"); } _stats.initialSyncAttemptInfos.emplace_back( @@ -1111,12 +1111,12 @@ void InitialSyncerFCB::_finishCallback(StatusWith lastApplied std::swap(_onCompletion, onCompletion); } - if (MONGO_unlikely(initialSyncHangBeforeFinishFCB.shouldFail())) { + if (MONGO_unlikely(initialSyncHangBeforeFinish.shouldFail())) { // This log output is used in js tests so please leave it. LOGV2(128444, - "initial sync - initialSyncHangBeforeFinishFCB fail point " + "initial sync - initialSyncHangBeforeFinish fail point " "enabled. 
Blocking until fail point is disabled."); - while (MONGO_unlikely(initialSyncHangBeforeFinishFCB.shouldFail()) && !_isShuttingDown()) { + while (MONGO_unlikely(initialSyncHangBeforeFinish.shouldFail()) && !_isShuttingDown()) { mongo::sleepsecs(1); } } @@ -1148,7 +1148,7 @@ void InitialSyncerFCB::_finishCallback(StatusWith lastApplied // Clear the initial sync progress after an initial sync attempt has been successfully // completed. - if (lastApplied.isOK() && !MONGO_unlikely(skipClearInitialSyncStateFCB.shouldFail())) { + if (lastApplied.isOK() && !MONGO_unlikely(skipClearInitialSyncState.shouldFail())) { _initialSyncState.reset(); } @@ -1159,11 +1159,11 @@ void InitialSyncerFCB::_finishCallback(StatusWith lastApplied _exec = nullptr; } - if (MONGO_unlikely(initialSyncHangAfterFinishFCB.shouldFail())) { + if (MONGO_unlikely(initialSyncHangAfterFinish.shouldFail())) { LOGV2(128446, - "initial sync finished - initialSyncHangAfterFinishFCB fail point " + "initial sync finished - initialSyncHangAfterFinish fail point " "enabled. Blocking until fail point is disabled."); - while (MONGO_unlikely(initialSyncHangAfterFinishFCB.shouldFail()) && !_isShuttingDown()) { + while (MONGO_unlikely(initialSyncHangAfterFinish.shouldFail()) && !_isShuttingDown()) { mongo::sleepsecs(1); } } From 5f82e7ebd6f88b966507f6c16c620482e97fadbf Mon Sep 17 00:00:00 2001 From: Igor Solodovnikov Date: Thu, 6 Mar 2025 11:10:29 +0000 Subject: [PATCH 32/32] PSMDB-1541 added initialSyncHangAfterCloningFiles fail point This fail point will stop right after cloning files from sync source allowing js tests to do something during FCBIS but before its final steps --- src/mongo/db/repl/initial_syncer_fcb.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/mongo/db/repl/initial_syncer_fcb.cpp b/src/mongo/db/repl/initial_syncer_fcb.cpp index 208bf1b188517..766a1463f3e2a 100644 --- a/src/mongo/db/repl/initial_syncer_fcb.cpp +++ b/src/mongo/db/repl/initial_syncer_fcb.cpp @@ -139,6 +139,9 @@ extern FailPoint initialSyncHangBeforeChoosingSyncSource; // Failpoint which causes the initial sync function to hang after finishing. extern FailPoint initialSyncHangAfterFinish; +// Failpoint which causes the initial sync function to hang after cloning files. +MONGO_FAIL_POINT_DEFINE(initialSyncHangAfterCloningFiles); + namespace { using namespace executor; using CallbackArgs = executor::TaskExecutor::CallbackArgs; @@ -1606,7 +1609,8 @@ void InitialSyncerFCB::_transferFileCallback( std::size_t fileIdx, // NOLINTNEXTLINE(*-unnecessary-value-param) std::shared_ptr onCompletionGuard) noexcept try { - stdx::lock_guard lock(_mutex); + // stdx::lock_guard lock(_mutex); + stdx::unique_lock lock(_mutex); auto status = _checkForShutdownAndConvertStatus_inlock( callbackArgs, "error transferring file from sync source"); if (!status.isOK()) { @@ -1669,6 +1673,20 @@ void InitialSyncerFCB::_transferFileCallback( onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); return; } + if (MONGO_unlikely(initialSyncHangAfterCloningFiles.shouldFail())) { + // This could have been done with a scheduleWorkAt but this is used only by JS tests + // where we run with multiple threads so it's fine to spin on this thread. This log + // output is used in js tests so please leave it. + LOGV2(128447, + "initial sync - initialSyncHangAfterCloningFiles fail point " + "enabled. 
Blocking until fail point is disabled."); + lock.unlock(); + while (MONGO_unlikely(initialSyncHangAfterCloningFiles.shouldFail()) && + !_isShuttingDown()) { + mongo::sleepsecs(1); + } + lock.lock(); + } // schedule next task status = _scheduleWorkAndSaveHandle_inlock( [this, onCompletionGuard](const executor::TaskExecutor::CallbackArgs& args) {