Skip to content

Ingest SST files rather than their key-value content. #12079

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion bindings/c/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ if(NOT WIN32)
add_executable(fdb_c_client_config_tester test/client_config_tester.cpp)
endif()

target_link_libraries(fdb_c_performance_test PRIVATE fdb_c Threads::Threads)
set_target_properties(fdb_c_performance_test PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(fdb_c_performance_test PRIVATE fdb_c Threads::Threads stdc++)
target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_txn_size_test PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_client_memory_test PRIVATE fdb_c Threads::Threads)
Expand Down
7 changes: 6 additions & 1 deletion fdbclient/ServerKnobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_MEMTABLE_PROTECTION_BYTES_PER_KEY, 0 ); if ( randomize && BUGGIFY ) ROCKSDB_MEMTABLE_PROTECTION_BYTES_PER_KEY = 8; // Default: 0 (disabled). Supported values: 0, 1, 2, 4, 8.
// Block cache key-value checksum. Checksum is validated during read, so has non-trivial impact on read performance.
init( ROCKSDB_BLOCK_PROTECTION_BYTES_PER_KEY, 0 ); if ( randomize && BUGGIFY ) ROCKSDB_BLOCK_PROTECTION_BYTES_PER_KEY = 8; // Default: 0 (disabled). Supported values: 0, 1, 2, 4, 8.
init( ROCKSDB_ENABLE_NONDETERMINISM, false );
init( ROCKSDB_ENABLE_NONDETERMINISM, false );
init( SHARDED_ROCKSDB_ALLOW_WRITE_STALL_ON_FLUSH, false );
init( SHARDED_ROCKSDB_VALIDATE_MAPPING_RATIO, 0.01 ); if (isSimulated) SHARDED_ROCKSDB_VALIDATE_MAPPING_RATIO = deterministicRandom()->random01();
init( SHARD_METADATA_SCAN_BYTES_LIMIT, 10485760 ); // 10MB
Expand Down Expand Up @@ -1410,4 +1410,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
(5.0 * VERSIONS_PER_SECOND);
clientKnobs->INIT_MID_SHARD_BYTES = MIN_SHARD_BYTES;
}

init(BULK_LOAD_USE_SST_INGEST, true); // Enable SST ingestion by default
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add `if (isSimulated) BULK_LOAD_USE_SST_INGEST = deterministicRandom()->coinflip();` to improve the test coverage?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

if (isSimulated) {
BULK_LOAD_USE_SST_INGEST = deterministicRandom()->coinflip();
}
}
12 changes: 12 additions & 0 deletions fdbclient/include/fdbclient/IKeyValueStore.actor.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ class IKeyValueStore : public IClosable {
// Returns true if the KV store supports shards, i.e., implements addRange(), removeRange(), and
// persistRangeMapping().
virtual bool shardAware() const { return false; }
// Returns true if the store supports external SST file ingestion, i.e., implements ingestSSTFiles().
// The default is false.
virtual bool supportsSstIngestion() const { return false; }
virtual void set(KeyValueRef keyValue, const Arena* arena = nullptr) = 0;
virtual void clear(KeyRangeRef range, const Arena* arena = nullptr) = 0;
virtual Future<Void> canCommit() { return Void(); }
Expand Down Expand Up @@ -134,6 +136,9 @@ class IKeyValueStore : public IClosable {
// Delete a checkpoint.
virtual Future<Void> deleteCheckpoint(const CheckpointMetaData& checkpoint) { throw not_implemented(); }

// Compact a range of keys in the store.
// The default implementation throws not_implemented(); a store that supports compaction overrides it.
virtual Future<Void> compactRange(KeyRangeRef range) { throw not_implemented(); }

/*
Concurrency contract
Causal consistency:
Expand All @@ -157,6 +162,13 @@ class IKeyValueStore : public IClosable {
// Obtain the encryption mode of the storage. The encryption mode needs to match the encryption mode of the cluster.
virtual Future<EncryptionAtRestMode> encryptionMode() = 0;

// Ingests the SST data files referenced by localFileSets into the store.
// The default implementation throws not_implemented(); a store that supports SST ingestion overrides it.
// Throws an error if the store does not support SST ingestion or if ingestion fails.
// It is the responsibility of the caller to ensure the directory exists and that localFileSets is valid.
virtual Future<Void> ingestSSTFiles(std::shared_ptr<BulkLoadFileSetKeyMap> localFileSets) {
throw not_implemented();
}

protected:
virtual ~IKeyValueStore() {}
};
Expand Down
2 changes: 2 additions & 0 deletions fdbclient/include/fdbclient/ServerKnobs.h
Original file line number Diff line number Diff line change
Expand Up @@ -1456,6 +1456,8 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ServerKnobs : public KnobsImpl<ServerKno
// Swift: Enable the Swift runtime hooks and use Swift implementations where possible
bool FLOW_WITH_SWIFT;

bool BULK_LOAD_USE_SST_INGEST; // Enable direct SST file ingestion for RocksDB storage engines

ServerKnobs(Randomize, ClientKnobs*, IsSimulated);
void initialize(Randomize, ClientKnobs*, IsSimulated);
};
2 changes: 1 addition & 1 deletion fdbclient/tests/fdb_cluster_fixture.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ function start_fdb_cluster {
"${local_build_dir}" \
--knobs "${knobs}" \
--stateless_count 1 --replication_count 1 --logs_count 1 \
--storage_count "${ss_count}" --storage_type ssd \
--storage_count "${ss_count}" --storage_type ssd-rocksdb-v1 \
Copy link
Member

@kakaiu kakaiu Apr 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to randomly choose between sqlite and rocksdb, for test coverage?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dunno. rocksdb is the standard, not sqlite.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We want the bulkload feature to work stably for both sqlite and rocksdb.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me make a new PR for this that allows setting the storage engine to use in ctest.

--dump_pids on \
> >(tee "${output}") \
2> >(tee "${output}" >&2)
Expand Down
139 changes: 138 additions & 1 deletion fdbserver/KeyValueStoreRocksDB.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
#include <memory>
#include <tuple>
#include <vector>
#include <fstream>

#endif // WITH_ROCKSDB

Expand Down Expand Up @@ -1318,6 +1319,79 @@ struct RocksDBKeyValueStore : IKeyValueStore {

void init() override {}

// Writer action requesting ingestion of the SST data files listed in a BulkLoadFileSetKeyMap.
struct IngestSSTFilesAction : TypedAction<Writer, IngestSSTFilesAction> {
    // File sets whose data files are to be ingested.
    std::shared_ptr<BulkLoadFileSetKeyMap> localFileSets;
    // Fulfilled (or failed) by the matching action() handler once ingestion has been attempted.
    ThreadReturnPromise<Void> done;

    IngestSSTFilesAction(std::shared_ptr<BulkLoadFileSetKeyMap> fileSets) : localFileSets(fileSets) {}

    // The time estimate reported for this action is the commit time estimate knob.
    double getTimeEstimate() const override {
        return SERVER_KNOBS->COMMIT_TIME_ESTIMATE;
    }
};

// Handles an IngestSSTFilesAction: collects the data file path of every file set that has one and
// passes them to RocksDB's IngestExternalFile(). a.done is fulfilled with Void on success (also when
// there is nothing to ingest) and with the converted RocksDB error otherwise.
void action(IngestSSTFilesAction& a) {
    // Gather the full paths of the data files to ingest; file sets without a data file are skipped.
    std::vector<std::string> dataFilePaths;
    for (const auto& [keyRange, fileSet] : *a.localFileSets) {
        if (!fileSet.hasDataFile()) {
            continue;
        }
        dataFilePaths.push_back(fileSet.getDataFileFullPath());
    }

    // Nothing to ingest: report it and complete successfully.
    if (dataFilePaths.empty()) {
        TraceEvent(SevInfo, "RocksDBIngestSSTFilesNoFiles", id);
        a.done.send(Void());
        return;
    }

    // Ask RocksDB to move the files and to verify their checksums before ingesting them.
    rocksdb::IngestExternalFileOptions ingestOptions;
    ingestOptions.move_files = true;
    ingestOptions.verify_checksums_before_ingest = true;

    // The default column family parameter is necessary here; w/o it the ingested keyvalues are unreadable
    rocksdb::Status ingestStatus = db->IngestExternalFile(cf, dataFilePaths, ingestOptions);
    if (ingestStatus.ok()) {
        a.done.send(Void());
        return;
    }

    logRocksDBError(id, ingestStatus, "IngestSSTFiles");
    a.done.sendError(statusToError(ingestStatus));
}

// Writer action requesting a manual compaction of a key range.
struct CompactRangeAction : TypedAction<Writer, CompactRangeAction> {
    // Key range to compact, stored as a KeyRange constructed from the KeyRangeRef given to the constructor.
    const KeyRange range;
    // Fulfilled (or failed) by the matching action() handler once compaction has been attempted.
    ThreadReturnPromise<Void> done;

    CompactRangeAction(KeyRangeRef keys) : range(keys) {}

    // The time estimate reported for this action is the commit time estimate knob.
    double getTimeEstimate() const override {
        return SERVER_KNOBS->COMMIT_TIME_ESTIMATE;
    }
};

// Handles a CompactRangeAction: runs RocksDB's CompactRange() over [a.range.begin, a.range.end) on
// the column family `cf`. a.done is fulfilled with Void on success and with the converted RocksDB
// error otherwise.
void action(CompactRangeAction& a) {
    rocksdb::CompactRangeOptions compactOptions;
    // Force RocksDB to rewrite file to last level
    compactOptions.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForceOptimized;

    // CompactRange() takes the range bounds as pointers to slices.
    auto beginSlice = toSlice(a.range.begin);
    auto endSlice = toSlice(a.range.end);

    rocksdb::Status compactStatus = db->CompactRange(compactOptions, cf, &beginSlice, &endSlice);
    if (compactStatus.ok()) {
        a.done.send(Void());
        return;
    }

    logRocksDBError(id, compactStatus, "CompactRange");
    a.done.sendError(statusToError(compactStatus));
}

struct OpenAction : TypedAction<Writer, OpenAction> {
std::string path;
ThreadReturnPromise<Void> done;
Expand Down Expand Up @@ -2141,6 +2215,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
// Closes the store by delegating to doClose().
// NOTE(review): the `false` argument presumably means the on-disk data is kept — confirm against doClose().
void close() override { doClose(this, false); }

// Identifies this store as the SSD_ROCKSDB_V1 key-value store type.
KeyValueStoreType getType() const override { return KeyValueStoreType(KeyValueStoreType::SSD_ROCKSDB_V1); }
// This store implements ingestSSTFiles(), so it reports support for SST file ingestion.
bool supportsSstIngestion() const override { return true; }

Future<Void> init() override {
if (openFuture.isValid()) {
Expand Down Expand Up @@ -2492,6 +2567,20 @@ struct RocksDBKeyValueStore : IKeyValueStore {
return EncryptionAtRestMode(EncryptionAtRestMode::DISABLED);
}

// Queues an IngestSSTFilesAction for localFileSets on the write thread and returns the future that
// the action's handler completes.
Future<Void> ingestSSTFiles(std::shared_ptr<BulkLoadFileSetKeyMap> localFileSets) override {
    auto* ingestAction = new Writer::IngestSSTFilesAction(localFileSets);
    // Obtain the future first: post() hands the action pointer off to the write thread.
    Future<Void> ingested = ingestAction->done.getFuture();
    writeThread->post(ingestAction);
    return ingested;
}

// Queues a CompactRangeAction for `range` on the write thread and returns the future that the
// action's handler completes.
Future<Void> compactRange(KeyRangeRef range) override {
    auto* compactAction = new Writer::CompactRangeAction(range);
    // Obtain the future first: post() hands the action pointer off to the write thread.
    Future<Void> compacted = compactAction->done.getFuture();
    writeThread->post(compactAction);
    return compacted;
}

DB db = nullptr;
std::shared_ptr<SharedRocksDBState> sharedState;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
Expand Down Expand Up @@ -2963,6 +3052,54 @@ TEST_CASE("noSim/RocksDB/RangeClear") {
wait(closed);
return Void();
}
} // namespace

// Verifies that a key written to an external SST file is readable through the store once
// ingestSSTFiles() has completed.
TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/IngestSSTFileVisibility") {
    state std::string testDir = "test_ingest_sst_visibility";
    // Start from a clean directory: one left behind by an earlier aborted run would otherwise be
    // reopened as an existing database and could hide a real failure.
    platform::eraseDirectoryRecursive(testDir);

    state UID testStoreID = deterministicRandom()->randomUniqueID();
    state RocksDBKeyValueStore* kvStore = new RocksDBKeyValueStore(testDir, testStoreID);

    // Initialize the store
    wait(kvStore->init());

    // Create an SST file holding a single key-value pair
    state std::string sstFilename = "test.sst"; // Base filename
    state std::string sstFileFullPath = joinPath(testDir, sstFilename); // Full path for writer
    rocksdb::SstFileWriter sstWriter(rocksdb::EnvOptions(), kvStore->sharedState->getOptions());
    ASSERT(sstWriter.Open(sstFileFullPath).ok()); // Use full path here
    ASSERT(sstWriter.Put("test_key", "test_value").ok());
    ASSERT(sstWriter.Finish().ok());

    // Create and populate the file set map (which is a vector)
    state std::shared_ptr<BulkLoadFileSetKeyMap> fileSetMap = std::make_shared<BulkLoadFileSetKeyMap>();
    // Dummy manifest filename for validation; only needed while constructing the file set below,
    // so it does not have to be an actor state variable.
    const std::string dummyManifestFile = "dummy_manifest.txt";

    // Create the BulkLoadFileSet using its constructor.
    // Pass the test directory, dummy manifest, and the base SST filename.
    BulkLoadFileSet fileSet(testDir, // rootPath
                            /*relativePath=*/"",
                            dummyManifestFile, // manifestFileName
                            sstFilename, // dataFileName (use base name)
                            /*byteSampleFileName=*/"",
                            BulkLoadChecksum()); // checksum

    fileSetMap->emplace_back(allKeys, fileSet); // Use emplace_back for std::vector

    // Ingest the SST file using the populated map
    wait(kvStore->ingestSSTFiles(fileSetMap));

    // Verify the key is visible
    Optional<Value> value = wait(kvStore->readValue("test_key"_sr, Optional<ReadOptions>()));
    ASSERT(value.present());
    ASSERT(value.get() == "test_value"_sr);

    // Clean up
    Future<Void> closed = kvStore->onClosed(); // Get future before dispose
    kvStore->dispose();
    wait(closed); // Wait for close completion
    platform::eraseDirectoryRecursive(testDir);

    return Void();
}

} // namespace
#endif // WITH_ROCKSDB
Loading