Skip to content

Separate backup and restore into two workloads [release-7.4] #12172

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 15 commits into
base: release-7.4
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
414 changes: 272 additions & 142 deletions fdbclient/FileBackupAgent.actor.cpp

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions fdbclient/ManagementAPI.actor.cpp
Original file line number Diff line number Diff line change
@@ -517,6 +517,34 @@ bool isCompleteConfiguration(std::map<std::string, std::string> const& options)
options.count(p + "storage_engine") == 1;
}

// Switches the backup worker off by applying "backup_worker_enabled:=0" through
// ManagementAPI::changeConfig.
// If the current database configuration already reports backup workers as disabled, only a
// trace event is emitted. Throws operation_failed() when the configuration change does not
// return ConfigurationResult::SUCCESS.
ACTOR Future<Void> disableBackupWorker(Database cx) {
	DatabaseConfiguration currentConfig = wait(getDatabaseConfiguration(cx));
	if (!currentConfig.backupWorkerEnabled) {
		// Nothing to change.
		TraceEvent("BackupWorkerAlreadyDisabled");
		return Void();
	}

	ConfigurationResult outcome =
	    wait(ManagementAPI::changeConfig(cx.getReference(), "backup_worker_enabled:=0", true));
	if (outcome == ConfigurationResult::SUCCESS) {
		return Void();
	}

	// Record the failing result before surfacing the error to the caller.
	TraceEvent("BackupWorkerDisableFailed").detail("Result", outcome);
	throw operation_failed();
}

// Switches the backup worker on by applying "backup_worker_enabled:=1" through
// ManagementAPI::changeConfig.
// If the current database configuration already reports backup workers as enabled, only a
// trace event is emitted. Throws operation_failed() when the configuration change does not
// return ConfigurationResult::SUCCESS.
ACTOR Future<Void> enableBackupWorker(Database cx) {
	DatabaseConfiguration currentConfig = wait(getDatabaseConfiguration(cx));
	if (currentConfig.backupWorkerEnabled) {
		// Nothing to change.
		TraceEvent("BackupWorkerAlreadyEnabled");
		return Void();
	}

	ConfigurationResult outcome =
	    wait(ManagementAPI::changeConfig(cx.getReference(), "backup_worker_enabled:=1", true));
	if (outcome == ConfigurationResult::SUCCESS) {
		return Void();
	}

	// Record the failing result before surfacing the error to the caller.
	TraceEvent("BackupWorkerEnableFailed").detail("Result", outcome);
	throw operation_failed();
}

/*
- Validates encryption and tenant mode configurations
- During cluster creation (configure new) we allow the following:
39 changes: 20 additions & 19 deletions fdbclient/include/fdbclient/BackupAgent.actor.h
Original file line number Diff line number Diff line change
@@ -25,14 +25,15 @@
#elif !defined(FDBCLIENT_BACKUP_AGENT_ACTOR_H)
#define FDBCLIENT_BACKUP_AGENT_ACTOR_H

#include <ctime>
#include <climits>

#include "flow/flow.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/TaskBucket.h"
#include "fdbclient/Notified.h"
#include "flow/IAsyncFile.h"
#include "fdbclient/KeyBackedTypes.actor.h"
#include <ctime>
#include <climits>
#include "fdbclient/BackupContainer.h"
#include "flow/actorcompiler.h" // has to be last include

@@ -205,8 +206,7 @@ class FileBackupAgent : public BackupAgentBase {
OnlyApplyMutationLogs = OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly = InconsistentSnapshotOnly::False,
Optional<std::string> const& encryptionKeyFileName = {},
Optional<std::string> blobManifestUrl = {},
TransformPartitionedLog transformPartitionedLog = TransformPartitionedLog::False);
Optional<std::string> blobManifestUrl = {});

// this method will construct range and version vectors and then call restore()
Future<Version> restore(Database cx,
@@ -245,8 +245,7 @@ class FileBackupAgent : public BackupAgentBase {
InconsistentSnapshotOnly inconsistentSnapshotOnly = InconsistentSnapshotOnly::False,
Version beginVersion = ::invalidVersion,
Optional<std::string> const& encryptionKeyFileName = {},
Optional<std::string> blobManifestUrl = {},
TransformPartitionedLog transformPartitionedLog = TransformPartitionedLog::False);
Optional<std::string> blobManifestUrl = {});

Future<Version> atomicRestore(Database cx,
Key tagName,
@@ -314,14 +313,16 @@ class FileBackupAgent : public BackupAgentBase {
partitionedLog,
incrementalBackupOnly,
encryptionKeyFileName,
blobManifestUrl);
blobManifestUrl) +
checkAndDisableBackupWorkers(cx);
});
}

Future<Void> discontinueBackup(Reference<ReadYourWritesTransaction> tr, Key tagName);
Future<Void> discontinueBackup(Database cx, Key tagName) {
return runRYWTransaction(
cx, [=](Reference<ReadYourWritesTransaction> tr) { return discontinueBackup(tr, tagName); });
cx, [=](Reference<ReadYourWritesTransaction> tr) { return discontinueBackup(tr, tagName); }) +
checkAndDisableBackupWorkers(cx);
}

// Terminate an ongoing backup, without waiting for the backup to finish.
@@ -333,9 +334,15 @@ class FileBackupAgent : public BackupAgentBase {
// logRangesRange and backupLogKeys will be cleared for this backup.
Future<Void> abortBackup(Reference<ReadYourWritesTransaction> tr, std::string tagName);
Future<Void> abortBackup(Database cx, std::string tagName) {
return runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) { return abortBackup(tr, tagName); });
// First abort the backup, then check and disable backup workers if needed.
return runRYWTransaction(cx,
[=](Reference<ReadYourWritesTransaction> tr) { return abortBackup(tr, tagName); }) +
checkAndDisableBackupWorkers(cx);
}

// Disable backup workers if no active partitioned backup is running.
Future<Void> checkAndDisableBackupWorkers(Database cx);

Future<std::string> getStatus(Database cx, ShowErrors, std::string tagName);
Future<std::string> getStatusJSON(Database cx, std::string tagName);

@@ -895,9 +902,6 @@ class BackupConfig : public KeyBackedTaskConfig {
return configSpace.pack(__FUNCTION__sr);
}

// Set to true if backup worker is enabled.
KeyBackedProperty<bool> backupWorkerEnabled() { return configSpace.pack(__FUNCTION__sr); }

// Set to true if partitioned log is enabled (only useful if backup worker is also enabled).
KeyBackedProperty<bool> partitionedLogEnabled() { return configSpace.pack(__FUNCTION__sr); }

@@ -929,18 +933,15 @@ class BackupConfig : public KeyBackedTaskConfig {
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
auto lastLog = latestLogEndVersion().get(tr);
auto firstSnapshot = firstSnapshotEndVersion().get(tr);
auto workerEnabled = backupWorkerEnabled().get(tr);
auto plogEnabled = partitionedLogEnabled().get(tr);
auto workerVersion = latestBackupWorkerSavedVersion().get(tr);
auto incrementalBackup = incrementalBackupOnly().get(tr);
return map(success(lastLog) && success(firstSnapshot) && success(workerEnabled) && success(plogEnabled) &&
success(workerVersion) && success(incrementalBackup),
return map(success(lastLog) && success(firstSnapshot) && success(plogEnabled) && success(workerVersion) &&
success(incrementalBackup),
[=](Void) -> Optional<Version> {
// The latest log greater than the oldest snapshot is the restorable version
Optional<Version> logVersion = workerEnabled.get().present() && workerEnabled.get().get() &&
plogEnabled.get().present() && plogEnabled.get().get()
? workerVersion.get()
: lastLog.get();
Optional<Version> logVersion =
plogEnabled.get().present() && plogEnabled.get().get() ? workerVersion.get() : lastLog.get();
if (logVersion.present() && firstSnapshot.get().present() &&
logVersion.get() > firstSnapshot.get().get()) {
return std::max(logVersion.get() - 1, firstSnapshot.get().get());
3 changes: 3 additions & 0 deletions fdbclient/include/fdbclient/ManagementAPI.actor.h
Original file line number Diff line number Diff line change
@@ -294,5 +294,8 @@ bool schemaMatch(json_spirit::mValue const& schema,
// storage nodes
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);

ACTOR Future<Void> disableBackupWorker(Database cx);
ACTOR Future<Void> enableBackupWorker(Database cx);

#include "flow/unactorcompiler.h"
#endif
2 changes: 0 additions & 2 deletions fdbserver/QuietDatabase.actor.cpp
Original file line number Diff line number Diff line change
@@ -1015,8 +1015,6 @@ ACTOR Future<Void> disableConsistencyScanInSim(Database db, bool waitForCompleti
return Void();
}

ACTOR Future<Void> disableBackupWorker(Database cx);

// Waits until a database quiets down (no data in flight, small tlog queue, low SQ, no active data distribution). This
// requires the database to be available and healthy in order to succeed.
ACTOR Future<Void> waitForQuietDatabase(Database cx,
14 changes: 7 additions & 7 deletions fdbserver/SimulatedCluster.actor.cpp
Original file line number Diff line number Diff line change
@@ -646,7 +646,8 @@ T simulate(const T& in) {
}

ACTOR Future<Void> runBackup(Reference<IClusterConnectionRecord> connRecord) {
state std::vector<Future<Void>> agentFutures;
state Future<Void> agentFuture;
state FileBackupAgent fileAgent;

while (g_simulator->backupAgents == ISimulator::BackupAgentType::WaitForType) {
wait(delay(1.0));
@@ -655,17 +656,16 @@ ACTOR Future<Void> runBackup(Reference<IClusterConnectionRecord> connRecord) {
if (g_simulator->backupAgents == ISimulator::BackupAgentType::BackupToFile) {
Database cx = Database::createDatabase(connRecord, ApiVersion::LATEST_VERSION);

state FileBackupAgent fileAgent;
agentFutures.push_back(fileAgent.run(
cx, 1.0 / CLIENT_KNOBS->BACKUP_AGGREGATE_POLL_RATE, CLIENT_KNOBS->SIM_BACKUP_TASKS_PER_AGENT));
TraceEvent("SimBackupAgentsStarting").log();
agentFuture =
fileAgent.run(cx, 1.0 / CLIENT_KNOBS->BACKUP_AGGREGATE_POLL_RATE, CLIENT_KNOBS->SIM_BACKUP_TASKS_PER_AGENT);

while (g_simulator->backupAgents == ISimulator::BackupAgentType::BackupToFile) {
wait(delay(1.0));
}

for (auto it : agentFutures) {
it.cancel();
}
TraceEvent("SimBackupAgentsStopping").log();
agentFuture.cancel();
}

wait(Future<Void>(Never()));
28 changes: 0 additions & 28 deletions fdbserver/tester.actor.cpp
Original file line number Diff line number Diff line change
@@ -2665,34 +2665,6 @@ ACTOR Future<Void> disableConnectionFailuresAfter(double seconds, std::string co
return Void();
}

ACTOR Future<Void> disableBackupWorker(Database cx) {
DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
if (!configuration.backupWorkerEnabled) {
TraceEvent("BackupWorkerAlreadyDisabled");
return Void();
}
ConfigurationResult res = wait(ManagementAPI::changeConfig(cx.getReference(), "backup_worker_enabled:=0", true));
if (res != ConfigurationResult::SUCCESS) {
TraceEvent("BackupWorkerDisableFailed").detail("Result", res);
throw operation_failed();
}
return Void();
}

ACTOR Future<Void> enableBackupWorker(Database cx) {
DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
if (configuration.backupWorkerEnabled) {
TraceEvent("BackupWorkerAlreadyEnabled");
return Void();
}
ConfigurationResult res = wait(ManagementAPI::changeConfig(cx.getReference(), "backup_worker_enabled:=1", true));
if (res != ConfigurationResult::SUCCESS) {
TraceEvent("BackupWorkerEnableFailed").detail("Result", res);
throw operation_failed();
}
return Void();
}

/**
* \brief Test orchestrator: sends test specification to testers in the right order and collects the results.
*
362 changes: 362 additions & 0 deletions fdbserver/workloads/Backup.actor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,362 @@
/*
* Backup.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2025 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "fdbclient/ReadYourWrites.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbclient/BackupContainerFileSystem.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/IRandom.h"
#include "flow/actorcompiler.h" // This must be the last #include.

// A workload which only performs backup operations. A separate workload is used to perform restore operations.
struct BackupWorkload : TestWorkload {
static constexpr auto NAME = "Backup";
double backupAfter, restoreAfter, abortAndRestartAfter;
double minBackupAfter;
double backupStartAt, restoreStartAfterBackupFinished, stopDifferentialAfter;
Key backupTag;
bool differentialBackup;
Standalone<VectorRef<KeyRangeRef>> backupRanges;
LockDB locked{ false };
UsePartitionedLog usePartitionedLog{ true };
bool allowPauses;
Optional<std::string> encryptionKeyFileName;

// Reads the workload options and derives the (partly randomized) backup schedule.
//
// Options read: usePartitionedLog, backupAfter, minBackupAfter, restoreAfter, backupTag,
// abortAndRestartAfter, differentialBackup, stopDifferentialAfter, allowPauses,
// restorePrefixesToInclude, encrypted.
//
// NOTE(review): the deterministicRandom() draws below are presumably part of the simulation's
// reproducible random sequence -- keep their order stable when editing.
BackupWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
	locked.set(sharedRandomNumber % 2);
	bool partitioned = getOption(options, "usePartitionedLog"_sr, true);
	usePartitionedLog.set(partitioned);

	backupAfter = getOption(options, "backupAfter"_sr, 10.0);
	// Assign the member here. Declaring a local with the same name would shadow the member
	// and leave it uninitialized.
	minBackupAfter = getOption(options, "minBackupAfter"_sr, backupAfter);
	if (backupAfter > minBackupAfter) {
		// Pick a start time uniformly in [minBackupAfter, backupAfter).
		backupAfter = deterministicRandom()->random01() * (backupAfter - minBackupAfter) + minBackupAfter;
	}

	restoreAfter = getOption(options, "restoreAfter"_sr, 35.0);
	backupTag = getOption(options, "backupTag"_sr, BackupAgentBase::getDefaultTag());

	// Default: with probability 0.5 pick a time between backupAfter and restoreAfter, else 0.
	abortAndRestartAfter =
	    getOption(options,
	              "abortAndRestartAfter"_sr,
	              deterministicRandom()->random01() < 0.5
	                  ? deterministicRandom()->random01() * (restoreAfter - backupAfter) + backupAfter
	                  : 0.0);

	differentialBackup = getOption(options, "differentialBackup"_sr, deterministicRandom()->random01() < 0.5);

	// Default: for differential backups pick a time between the later of
	// (abortAndRestartAfter, backupAfter) and restoreAfter; otherwise 0.
	const double earliestStop = std::max(abortAndRestartAfter, backupAfter);
	stopDifferentialAfter =
	    getOption(options,
	              "stopDifferentialAfter"_sr,
	              differentialBackup ? deterministicRandom()->random01() * (restoreAfter - earliestStop) + earliestStop
	                                 : 0.0);

	allowPauses = getOption(options, "allowPauses"_sr, true);

	// The value is not used by this workload.
	// NOTE(review): the read is kept on purpose -- presumably getOption() marks the option as
	// consumed for the test harness; confirm before removing.
	std::vector<std::string> restorePrefixesToInclude =
	    getOption(options, "restorePrefixesToInclude"_sr, std::vector<std::string>());

	// Default: encrypt with probability 0.1.
	if (getOption(options, "encrypted"_sr, deterministicRandom()->random01() < 0.1)) {
		encryptionKeyFileName = "simfdb/" + getTestEncryptionFileName();
	}

	TraceEvent("BW_ClientId").detail("Id", wcx.clientId);
	backupRanges.push_back_deep(backupRanges.arena(), normalKeys);
}

// This workload has no setup phase; returns Void() without touching the database.
Future<Void> setup(Database const& cx) override { return Void(); }

// Entry point of the workload. Only client 0 logs the effective parameters and runs the
// backup via _start(); every other client returns Void() right away.
Future<Void> start(Database const& cx) override {
	if (clientId == 0) {
		TraceEvent(SevInfo, "BW_Param")
		    .detail("Locked", locked)
		    .detail("BackupAfter", backupAfter)
		    .detail("RestoreAfter", restoreAfter)
		    .detail("BackupTag", printable(backupTag).c_str())
		    .detail("AbortAndRestartAfter", abortAndRestartAfter)
		    .detail("DifferentialBackup", differentialBackup)
		    .detail("StopDifferentialAfter", stopDifferentialAfter)
		    .detail("Encrypted", encryptionKeyFileName.present());

		return _start(cx, this);
	}

	return Void();
}

// Always returns true; no verification is performed in check() by this workload.
Future<bool> check(Database const& cx) override { return true; }

// Adds nothing to `m`; this workload reports no metrics.
void getMetrics(std::vector<PerfMetric>& m) override {}

// Endlessly toggles the backup agent's paused flag: pause, stay paused for a random time of
// up to 30 seconds, resume, then run for a random time of up to 120 seconds. Never returns.
ACTOR static Future<Void> changePaused(Database cx, FileBackupAgent* backupAgent) {
	loop {
		wait(backupAgent->changePause(cx, true));
		TraceEvent("BW_AgentPaused").log();
		double pausedFor = 30 * deterministicRandom()->random01();
		wait(delay(pausedFor));

		wait(backupAgent->changePause(cx, false));
		TraceEvent("BW_AgentResumed").log();
		double runningFor = 120 * deterministicRandom()->random01();
		wait(delay(runningFor));
	}
}

// Resume the backup agent if it is paused.
// Calls changePause(cx, false) once on `backupAgent`, logging a trace event before and after.
ACTOR static Future<Void> resumeAgent(Database cx, FileBackupAgent* backupAgent) {
TraceEvent("BW_AgentResuming").log();
wait(backupAgent->changePause(cx, false));
TraceEvent("BW_AgentResumed").log();
return Void();
}

// Polls the backup status for `tag` forever, roughly every two seconds. Each round traces
// whether an agent is active and prints both the textual and the JSON status to stdout.
// Uses its own FileBackupAgent instance. Never returns.
ACTOR static Future<Void> statusLoop(Database cx, std::string tag) {
	state FileBackupAgent statusAgent;
	loop {
		bool isActive = wait(statusAgent.checkActive(cx));
		TraceEvent("BW_AgentActivityCheck").detail("IsActive", isActive);

		std::string textStatus = wait(statusAgent.getStatus(cx, ShowErrors::True, tag));
		puts(textStatus.c_str());

		std::string jsonStatus = wait(statusAgent.getStatusJSON(cx, tag));
		puts(jsonStatus.c_str());

		wait(delay(2.0));
	}
}

// Runs one backup for `tag` from submission to completion.
//
// Steps:
//   1. Wait `startDelay` seconds; if startDelay is non-zero (or BUGGIFY fires) abort any
//      backup currently registered under the tag (backup_unneeded is tolerated).
//   2. Submit a new backup of `backupRanges` to the container "file://simfdb/backups/".
//   3. If `stopDifferentialDelay` is non-zero, wait for that delay and then discontinue the
//      backup (or abort it, see below); under BUGGIFY the backup is first checked for
//      restorability.
//   4. Wait for the backup to complete and trace its final status.
//
// Parameters:
//   self                  - workload instance; supplies usePartitionedLog and encryptionKeyFileName.
//   startDelay            - seconds to wait before doing anything; 0 for the first backup.
//   backupAgent           - agent used for all backup operations.
//   cx                    - database to back up.
//   tag                   - backup tag.
//   backupRanges          - key ranges to back up.
//   stopDifferentialDelay - seconds (from actor start) after which the backup is discontinued;
//                           0 means the backup is submitted with StopWhenDone set to true.
//
// Errors: backup_unneeded and backup_duplicate are swallowed where noted; anything else is rethrown.
ACTOR static Future<Void> doBackup(BackupWorkload* self,
double startDelay,
FileBackupAgent* backupAgent,
Database cx,
Key tag,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
double stopDifferentialDelay) {

// Only used to correlate this actor's trace events.
state UID randomID = nondeterministicRandom()->randomUniqueID();

// Created before the start delay is awaited, so both delays are measured from actor start.
state Future<Void> stopDifferentialFuture = delay(stopDifferentialDelay);
wait(delay(startDelay));

if (startDelay || BUGGIFY) {
TraceEvent("BW_DoBackupAbortBackup1", randomID)
.detail("Tag", printable(tag))
.detail("StartDelay", startDelay);

try {
wait(backupAgent->abortBackup(cx, tag.toString()));
} catch (Error& e) {
TraceEvent("BW_DoBackupAbortBackupException", randomID).error(e).detail("Tag", printable(tag));
// Nothing to abort is fine; anything else is a real failure.
if (e.code() != error_code_backup_unneeded)
throw;
}
}

TraceEvent("BW_DoBackupSubmitBackup", randomID)
.detail("Tag", printable(tag))
.detail("StopWhenDone", stopDifferentialDelay ? "False" : "True");

state std::string backupContainer = "file://simfdb/backups/";
// Status printer running alongside this actor.
// NOTE(review): presumably it is cancelled when this actor's state is destroyed -- confirm.
state Future<Void> status = statusLoop(cx, tag.toString());
try {
// NOTE(review): the two random integers are passed positionally; their meaning is defined by
// FileBackupAgent::submitBackup, which is not fully visible here.
wait(backupAgent->submitBackup(cx,
StringRef(backupContainer),
{},
deterministicRandom()->randomInt(0, 60),
deterministicRandom()->randomInt(0, 2000),
tag.toString(),
backupRanges,
false,
StopWhenDone{ !stopDifferentialDelay },
self->usePartitionedLog,
IncrementalBackupOnly::False,
self->encryptionKeyFileName));
} catch (Error& e) {
TraceEvent("BW_DoBackupSubmitBackupException", randomID).error(e).detail("Tag", printable(tag));
if (e.code() != error_code_backup_unneeded && e.code() != error_code_backup_duplicate)
throw;
}

// Stop the differential backup, if enabled
if (stopDifferentialDelay) {
CODE_PROBE(!stopDifferentialFuture.isReady(),
"Restore starts at specified time - stopDifferential not ready");
wait(stopDifferentialFuture);
TraceEvent("BW_DoBackupWaitToDiscontinue", randomID)
.detail("Tag", printable(tag))
.detail("DifferentialAfter", stopDifferentialDelay);

try {
// Under BUGGIFY, additionally verify that the backup is restorable before stopping it.
if (BUGGIFY) {
state KeyBackedTag backupTag = makeBackupTag(tag.toString());
TraceEvent("BW_DoBackupWaitForRestorable", randomID).detail("Tag", backupTag.tagName);

// Wait until the backup is in a restorable state and get the status, URL, and UID atomically
state Reference<IBackupContainer> lastBackupContainer;
state UID lastBackupUID;
state EBackupState resultWait = wait(backupAgent->waitBackup(
cx, backupTag.tagName, StopWhenDone::False, &lastBackupContainer, &lastBackupUID));

TraceEvent("BW_DoBackupWaitForRestorable", randomID)
.detail("Tag", backupTag.tagName)
.detail("Result", BackupAgentBase::getStateText(resultWait));

state bool restorable = false;
if (lastBackupContainer) {
state Future<BackupDescription> fdesc = lastBackupContainer->describeBackup();
// ready() lets us inspect an error on fdesc below instead of having it thrown here.
wait(ready(fdesc));

if (!fdesc.isError()) {
state BackupDescription desc = fdesc.get();
wait(desc.resolveVersionTimes(cx));
printf("BackupDescription:\n%s\n", desc.toString().c_str());
restorable = desc.maxRestorableVersion.present();
}
}

TraceEvent("BW_LastBackupContainer", randomID)
.detail("BackupTag", printable(tag))
.detail("LastBackupContainer", lastBackupContainer ? lastBackupContainer->getURL() : "")
.detail("LastBackupUID", lastBackupUID)
.detail("WaitStatus", BackupAgentBase::getStateText(resultWait))
.detail("Restorable", restorable);

// Do not check the backup, if aborted
if (resultWait == EBackupState::STATE_ABORTED) {
}
// Ensure that a backup container was found
else if (!lastBackupContainer) {
TraceEvent(SevError, "BW_MissingBackupContainer", randomID)
.detail("LastBackupUID", lastBackupUID)
.detail("BackupTag", printable(tag))
.detail("WaitStatus", BackupAgentBase::getStateText(resultWait));
printf("BackupCorrectnessMissingBackupContainer tag: %s status: %s\n",
printable(tag).c_str(),
BackupAgentBase::getStateText(resultWait));
}
// Check that backup is restorable
else if (!restorable) {
TraceEvent(SevError, "BW_NotRestorable", randomID)
.detail("LastBackupUID", lastBackupUID)
.detail("BackupTag", printable(tag))
.detail("BackupFolder", lastBackupContainer->getURL())
.detail("WaitStatus", BackupAgentBase::getStateText(resultWait));
printf("BackupCorrectnessNotRestorable: tag: %s\n", printable(tag).c_str());
}

// Abort the backup, if not the first backup because the second backup may have aborted the backup
// by now
if (startDelay) {
TraceEvent("BW_DoBackupAbortBackup2", randomID)
.detail("Tag", printable(tag))
.detail("WaitStatus", BackupAgentBase::getStateText(resultWait))
.detail("LastBackupContainer", lastBackupContainer ? lastBackupContainer->getURL() : "")
.detail("Restorable", restorable);
wait(backupAgent->abortBackup(cx, tag.toString()));
} else {
TraceEvent("BW_DoBackupDiscontinueBackup", randomID)
.detail("Tag", printable(tag))
.detail("DifferentialAfter", stopDifferentialDelay);
wait(backupAgent->discontinueBackup(cx, tag));
}
}

// Without BUGGIFY: discontinue directly, with no restorability check.
else {
TraceEvent("BW_DoBackupDiscontinueBackup", randomID)
.detail("Tag", printable(tag))
.detail("DifferentialAfter", stopDifferentialDelay);
wait(backupAgent->discontinueBackup(cx, tag));
}
} catch (Error& e) {
TraceEvent("BW_DoBackupDiscontinueBackupException", randomID).error(e).detail("Tag", printable(tag));
if (e.code() != error_code_backup_unneeded && e.code() != error_code_backup_duplicate)
throw;
}
}

// Wait for the backup to complete
TraceEvent("BW_DoBackupWaitBackup", randomID).detail("Tag", printable(tag));
state EBackupState statusValue = wait(backupAgent->waitBackup(cx, tag.toString(), StopWhenDone::True));

std::string statusText = wait(backupAgent->getStatus(cx, ShowErrors::True, tag.toString()));
// Can we validate anything about status?

TraceEvent("BW_DoBackupComplete", randomID)
.detail("Tag", printable(tag))
.detail("Status", statusText)
.detail("StatusValue", BackupAgentBase::getStateText(statusValue));

return Void();
}

// Drives the whole workload on client 0.
//
// Sequence: start the pause/resume helper, create the test encryption key file if encryption
// is enabled, wait `backupAfter` seconds, run a single doBackup() to completion, and finally
// wait until `restoreAfter` seconds have elapsed since the schedule began.
//
// Errors: a database_locked error from the backup ends the workload successfully; any other
// error is traced at SevError and rethrown.
ACTOR static Future<Void> _start(Database cx, BackupWorkload* self) {
	state FileBackupAgent backupAgent;
	// Holds the pause/resume helper actor.
	// NOTE(review): presumably keeping this future in a state variable is what keeps the helper
	// running for the lifetime of _start -- confirm.
	state Future<Void> cp;
	TraceEvent("BW_Arguments")
	    .detail("BackupTag", printable(self->backupTag))
	    .detail("BackupAfter", self->backupAfter)
	    .detail("RestoreAfter", self->restoreAfter)
	    .detail("AbortAndRestartAfter", self->abortAndRestartAfter)
	    .detail("DifferentialAfter", self->stopDifferentialAfter);

	// Only used to correlate this actor's trace events.
	state UID randomID = nondeterministicRandom()->randomUniqueID();
	if (self->allowPauses && BUGGIFY) {
		cp = changePaused(cx, &backupAgent);
	} else {
		cp = resumeAgent(cx, &backupAgent);
	}

	if (self->encryptionKeyFileName.present()) {
		wait(BackupContainerFileSystem::createTestEncryptionKeyFile(self->encryptionKeyFileName.get()));
	}

	try {
		// Started before the backup delay so that restoreAfter is measured from the same origin.
		state Future<Void> startRestore = delay(self->restoreAfter);

		// backup
		wait(delay(self->backupAfter));

		TraceEvent("BW_DoBackup1", randomID).detail("Tag", printable(self->backupTag));
		state Future<Void> b =
		    doBackup(self, 0, &backupAgent, cx, self->backupTag, self->backupRanges, self->stopDifferentialAfter);

		TraceEvent("BW_DoBackupWait", randomID)
		    .detail("BackupTag", printable(self->backupTag))
		    .detail("AbortAndRestartAfter", self->abortAndRestartAfter);
		try {
			wait(b);
		} catch (Error& e) {
			// A locked database is treated as a clean exit; everything else propagates.
			if (e.code() != error_code_database_locked)
				throw;
			return Void();
		}
		TraceEvent("BW_DoBackupDone", randomID)
		    .detail("BackupTag", printable(self->backupTag))
		    .detail("AbortAndRestartAfter", self->abortAndRestartAfter);

		wait(startRestore);

		// We can't remove after backup agents since the restore also needs them.
		// I.e., g_simulator->backupAgents = ISimulator::BackupAgentType::NoBackupAgents
	} catch (Error& e) {
		TraceEvent(SevError, "BackupCorrectness").error(e).GetLastError();
		throw;
	}
	return Void();
}
};

WorkloadFactory<BackupWorkload> BackupWorkloadFactory;
6 changes: 2 additions & 4 deletions fdbserver/workloads/BackupCorrectnessPartitioned.actor.cpp
Original file line number Diff line number Diff line change
@@ -501,8 +501,7 @@ struct BackupAndRestorePartitionedCorrectnessWorkload : TestWorkload {
InconsistentSnapshotOnly::False,
::invalidVersion,
self->encryptionKeyFileName,
{},
TransformPartitionedLog::True)));
{})));
printf("BackupCorrectness, backupAgent.restore finished for tag:%s\n", restoreTag.toString().c_str());
return Void();
}
@@ -659,8 +658,7 @@ struct BackupAndRestorePartitionedCorrectnessWorkload : TestWorkload {
InconsistentSnapshotOnly::False,
::invalidVersion,
self->encryptionKeyFileName,
{},
TransformPartitionedLog::True));
{}));

wait(waitForAll(restores));

380 changes: 380 additions & 0 deletions fdbserver/workloads/Restore.actor.cpp

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -483,7 +483,10 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES slow/ApiCorrectnessAtomicRestore.toml)
add_fdb_test(TEST_FILES slow/ApiCorrectnessSwitchover.toml)
add_fdb_test(TEST_FILES slow/ApiCorrectnessWithConsistencyCheck.toml)
add_fdb_test(TEST_FILES slow/BackupAndRestore.toml)
add_fdb_test(TEST_FILES slow/BackupCorrectnessPartitioned.toml)
add_fdb_test(TEST_FILES slow/BackupNewAndOldRestore.toml)
add_fdb_test(TEST_FILES slow/BackupOldAndNewRestore.toml)
add_fdb_test(TEST_FILES slow/ClogWithRollbacks.toml)
add_fdb_test(TEST_FILES slow/CloggedCycleTest.toml)
add_fdb_test(TEST_FILES slow/CloggedStorefront.toml)
47 changes: 47 additions & 0 deletions tests/slow/BackupAndRestore.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
testClass = "Backup"

[configuration]
tenantModes = ['disabled'] # Do not support tenant
encryptModes = ['disabled'] # Do not support encryption

[[test]]
testTitle = 'BackupPartitioned'
clearAfterTest = false
runConsistencyCheck = false
waitForQuiescence = false
simBackupAgents = 'BackupToFile'

[[test.workload]]
testName = 'Cycle'
nodeCount = 3000
transactionsPerSecond = 2500.0
testDuration = 30.0
expectedRate = 0

[[test.workload]]
testName = 'Backup'
usePartitionedLog = true
backupAfter = 10.0
restoreAfter = 60.0

[[test]]
testTitle = 'RestorePartitioned'
runConsistencyCheck = false
waitForQuiescence = false
simBackupAgents = 'BackupToFile'
clearAfterTest = false

[[test.workload]]
testName = 'Restore'

# check consistency after restore
[[test]]
testTitle = 'CycleAfterRestore'

[[test.workload]]
testName = 'Cycle'
nodeCount = 3000
transactionsPerSecond = 2500.0
testDuration = 10.0
skipSetup = true
expectedRate = 0
1 change: 0 additions & 1 deletion tests/slow/BackupCorrectnessPartitioned.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
testClass = "Backup"

[configuration]
buggify = false
tenantModes = ['disabled'] # Do not support tenant
encryptModes = ['disabled'] # Do not support encryption

74 changes: 74 additions & 0 deletions tests/slow/BackupNewAndOldRestore.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
testClass = "Backup"

[configuration]
tenantModes = ['disabled'] # Do not support tenant
encryptModes = ['disabled'] # Do not support encryption

[[test]]
testTitle = 'NewBackup'
clearAfterTest = false
simBackupAgents = 'BackupToFile'

[[test.workload]]
testName = 'Cycle'
nodeCount = 3000
transactionsPerSecond = 2500.0
testDuration = 30.0
expectedRate = 0

[[test.workload]]
testName = 'Backup'
usePartitionedLog = true
encrypted = false
backupTag = 'newBackup'
backupAfter = 10.0
restoreAfter = 60.0

[[test]]
testTitle = 'OldBackup'
runConsistencyCheck = false
waitForQuiescence = false
clearAfterTest = false
simBackupAgents = 'BackupToFile'

[[test.workload]]
testName = 'Cycle'
nodeCount = 3000
transactionsPerSecond = 2500.0
testDuration = 30.0
skipSetup = true
expectedRate = 0

[[test.workload]]
testName = 'Backup'
usePartitionedLog = false
encrypted = false
backupTag = 'oldBackup'
backupAfter = 10.0
restoreAfter = 60.0

# Randomly pick one of the backup tags and restore it.
[[test]]
testTitle = 'RestoreRandomBackup'
runConsistencyCheck = false
waitForQuiescence = false
simBackupAgents = 'BackupToFile'
clearAfterTest = false

[[test.workload]]
testName = 'Restore'
backupTag1 = 'newBackup'
backupTag2 = 'oldBackup'
encrypted = false

# check consistency after restore
[[test]]
testTitle = 'CycleAfterRestore'

[[test.workload]]
testName = 'Cycle'
nodeCount = 3000
transactionsPerSecond = 2500.0
testDuration = 10.0
skipSetup = true
expectedRate = 0
77 changes: 77 additions & 0 deletions tests/slow/BackupOldAndNewRestore.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
testClass = "Backup"

[configuration]
tenantModes = ['disabled'] # Do not support tenant
encryptModes = ['disabled'] # Do not support encryption

[[test]]
testTitle = 'OldBackup'
runConsistencyCheck = false
waitForQuiescence = false
clearAfterTest = false
simBackupAgents = 'BackupToFile'

[[test.workload]]
testName = 'Cycle'
nodeCount = 3000
transactionsPerSecond = 2500.0
testDuration = 30.0
expectedRate = 0

# TODO: pass encrypted file across backup and restore workload
[[test.workload]]
testName = 'Backup'
usePartitionedLog = false
encrypted = false
backupTag = 'oldBackup'
backupAfter = 10.0
restoreAfter = 60.0

[[test]]
testTitle = 'NewBackup'
runConsistencyCheck = false
waitForQuiescence = false
clearAfterTest = false
simBackupAgents = 'BackupToFile'

[[test.workload]]
testName = 'Cycle'
nodeCount = 3000
transactionsPerSecond = 2500.0
testDuration = 30.0
skipSetup = true
expectedRate = 0

[[test.workload]]
testName = 'Backup'
usePartitionedLog = true
encrypted = false
backupTag = 'newBackup'
backupAfter = 10.0
restoreAfter = 60.0

# Randomly pick one of the backup tags and restore it.
[[test]]
testTitle = 'RestoreRandomBackup'
runConsistencyCheck = false
waitForQuiescence = false
simBackupAgents = 'BackupToFile'
clearAfterTest = false

[[test.workload]]
testName = 'Restore'
backupTag1 = 'oldBackup'
backupTag2 = 'newBackup'
encrypted = false

# check consistency after restore
[[test]]
testTitle = 'CycleAfterRestore'

[[test.workload]]
testName = 'Cycle'
nodeCount = 3000
transactionsPerSecond = 2500.0
testDuration = 10.0
skipSetup = true
expectedRate = 0