
Commit 10e34ed

Support multiple range
1 parent f4cf27e commit 10e34ed

File tree

5 files changed: +160 -27 lines changed

fdbclient/ServerKnobs.cpp

Lines changed: 1 addition & 0 deletions

@@ -623,6 +623,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
     // Block cache key-value checksum. Checksum is validated during read, so has non-trivial impact on read performance.
     init( ROCKSDB_BLOCK_PROTECTION_BYTES_PER_KEY, 0 ); if ( randomize && BUGGIFY ) ROCKSDB_BLOCK_PROTECTION_BYTES_PER_KEY = 8; // Default: 0 (disabled). Supported values: 0, 1, 2, 4, 8.
     init( ROCKSDB_ENABLE_NONDETERMINISM, false );
+    init( SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES, false );
     init( SHARDED_ROCKSDB_ALLOW_WRITE_STALL_ON_FLUSH, false );
     init( SHARDED_ROCKSDB_VALIDATE_MAPPING_RATIO, 0.01 ); if (isSimulated) SHARDED_ROCKSDB_VALIDATE_MAPPING_RATIO = deterministicRandom()->random01();
     init( SHARD_METADATA_SCAN_BYTES_LIMIT, 10485760 ); // 10MB

fdbclient/include/fdbclient/ServerKnobs.h

Lines changed: 1 addition & 0 deletions

@@ -607,6 +607,7 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ServerKnobs : public KnobsImpl<ServerKno
     // Note that turning this on in simulation could lead to non-deterministic runs
     // since we rely on rocksdb metadata. This knob also applies to sharded rocks
     // storage engine.
+    bool SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES;
     bool SHARDED_ROCKSDB_ALLOW_WRITE_STALL_ON_FLUSH;
     int SHARDED_ROCKSDB_MEMTABLE_MAX_RANGE_DELETIONS;
     double SHARDED_ROCKSDB_VALIDATE_MAPPING_RATIO;
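
The new flag follows the standard ServerKnobs pattern: a field declared in ServerKnobs.h, a default set in ServerKnobs::initialize(), and reads through SERVER_KNOBS at the call site (the storage-engine change below gates its logging on SERVER_KNOBS->SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES). A minimal, self-contained sketch of that pattern, using a toy knobs struct rather than FDB's real KnobsImpl machinery; the type and function names here are illustrative only:

#include <iostream>

// Toy stand-in for ServerKnobs; only the new flag is modeled.
struct ToyServerKnobs {
    bool SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES;

    void initialize() {
        // Mirrors init( SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES, false ):
        // the feature stays off unless a config or test overrides it.
        SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES = false;
    }
};

static ToyServerKnobs toyKnobs;

int main() {
    toyKnobs.initialize();
    if (!toyKnobs.SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES) {
        std::cout << "non-contiguous ranges on one physical shard are only logged, not expected\n";
    }
    return 0;
}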

fdbserver/KeyValueStoreShardedRocksDB.actor.cpp

Lines changed: 150 additions & 23 deletions

@@ -1546,7 +1546,7 @@ class ShardManager {
     for (auto it = ranges.begin(); it != ranges.end(); ++it) {
         if (it.value()) {
             if (it.value()->physicalShard->id == id) {
-                TraceEvent(SevError, "ShardedRocksDBAddRange")
+                TraceEvent(SevWarn, "ShardedRocksDBAddRange")
                     .detail("ErrorType", "RangeAlreadyExist")
                     .detail("IntersectingRange", it->range())
                     .detail("DataShardRange", it->value()->range)
@@ -1564,15 +1564,45 @@
         }
     }
 
-    auto currentCfOptions = active ? rState->getCFOptions() : rState->getCFOptionsForInactiveShard();
-    auto [it, inserted] = physicalShards.emplace(id, std::make_shared<PhysicalShard>(db, id, currentCfOptions));
-    std::shared_ptr<PhysicalShard>& shard = it->second;
+    auto it = physicalShards.find(id);
+    std::shared_ptr<PhysicalShard> physicalShard = nullptr;
+    if (it != physicalShards.end()) {
+        physicalShard = it->second;
+        // TODO: consider coalescing continous key ranges.
+        if (!SERVER_KNOBS->SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES) {
+            bool continous = physicalShard->dataShards.empty();
+            std::string rangeStr = "";
+            for (auto& [_, shard] : physicalShard->dataShards) {
+                rangeStr += shard->range.toString() + ", ";
+                if (shard->range.begin < range.begin && shard->range.end == range.begin) {
+                    continous = true;
+                    break;
+                }
+                if (shard->range.begin > range.begin && range.end == shard->range.begin) {
+                    continous = true;
+                    break;
+                }
+            }
+            if (!continous) {
+                // When multiple shards are merged into a single shard, the storage server might already own a piece
+                // of the resulting shard. Because intra storage server move is disabled, the merge data move could
+                // create multiple segments in a single physcial shard.
+                TraceEvent("AddMultipleRanges")
+                    .detail("NewRange", range)
+                    .detail("OtherRanges", rangeStr)
+                    .setMaxFieldLength(1000);
+            }
+        }
+    } else {
+        auto currentCfOptions = active ? rState->getCFOptions() : rState->getCFOptionsForInactiveShard();
+        auto [it, inserted] = physicalShards.emplace(id, std::make_shared<PhysicalShard>(db, id, currentCfOptions));
+        physicalShard = it->second;
+    }
 
     activePhysicalShardIds.emplace(id);
-
-    auto dataShard = std::make_unique<DataShard>(range, shard.get());
+    auto dataShard = std::make_unique<DataShard>(range, physicalShard.get());
     dataShardMap.insert(range, dataShard.get());
-    shard->dataShards[range.begin.toString()] = std::move(dataShard);
+    physicalShard->dataShards[range.begin.toString()] = std::move(dataShard);
 
     validate();
 
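To make the new addRange path concrete: when the physical shard already exists and SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES is off, the code treats the incoming range as contiguous only if it begins exactly where an existing data shard ends or ends exactly where one begins; anything else is reported via the AddMultipleRanges event. A standalone sketch of that adjacency test, with toy string ranges standing in for KeyRangeRef and the real containers:

#include <iostream>
#include <string>
#include <vector>

// Half-open range [begin, end), modeled with plain strings.
struct ToyRange {
    std::string begin, end;
};

// Returns true if `next` abuts one of the ranges already owned by the shard,
// mirroring the two comparisons in the hunk above.
bool isContiguous(const std::vector<ToyRange>& existing, const ToyRange& next) {
    if (existing.empty())
        return true;
    for (const auto& r : existing) {
        if (r.begin < next.begin && r.end == next.begin)
            return true; // an existing range ends exactly where the new one starts
        if (r.begin > next.begin && next.end == r.begin)
            return true; // the new range ends exactly where an existing one starts
    }
    return false;
}

int main() {
    std::vector<ToyRange> shard = { { "a", "d" } };
    std::cout << isContiguous(shard, { "d", "g" }) << "\n"; // 1: adjacent, no event
    std::cout << isContiguous(shard, { "g", "n" }) << "\n"; // 0: gap, would log AddMultipleRanges
    return 0;
}

The second call matches the new test further down, which deliberately assigns ["a", "d") and ["g", "n") to the same shard.
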
@@ -1581,7 +1611,7 @@
         .detail("ShardId", id)
         .detail("Active", active);
 
-    return shard.get();
+    return physicalShard.get();
 }
 
 std::vector<std::string> removeRange(KeyRange range) {
@@ -1636,6 +1666,7 @@
 
         // Range modification could result in more than one segments. Remove the original segment key here.
         existingShard->dataShards.erase(shardRange.begin.toString());
+        int count = 0;
         if (shardRange.begin < range.begin) {
             auto dataShard =
                 std::make_unique<DataShard>(KeyRange(KeyRangeRef(shardRange.begin, range.begin)), existingShard);
@@ -1646,6 +1677,7 @@
 
             existingShard->dataShards[shardRange.begin.toString()] = std::move(dataShard);
             logShardEvent(existingShard->id, shardRange, ShardOp::MODIFY_RANGE, SevInfo, msg);
+            count++;
         }
 
         if (shardRange.end > range.end) {
@@ -1658,6 +1690,17 @@
 
             existingShard->dataShards[range.end.toString()] = std::move(dataShard);
             logShardEvent(existingShard->id, shardRange, ShardOp::MODIFY_RANGE, SevInfo, msg);
+            count++;
+        }
+
+        if (count > 1) {
+            // During shard split, a shard could be split into multiple key ranges. One of the key ranges will
+            // remain on the storage server, other key ranges will be moved to new server. Depending on the order of
+            // executing the split data moves, a shard could be break into multiple pieces. Eventually a single
+            // continous key range will remain on the physical shard. Data consistency is guaranteed.
+            //
+            // For team based shard placement, we expect multiple data shards to be located on the same physical shard.
+            TraceEvent("RangeSplit").detail("OriginalRange", shardRange).detail("RemovedRange", range);
         }
     }
 
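The count logic above tracks how many remainders survive a removal: clearing [begin, end) out of an existing data shard can leave a piece on the left, a piece on the right, both (the RangeSplit case), or nothing. A small standalone sketch of that remainder computation, again with toy string ranges instead of FDB keys:

#include <iostream>
#include <string>
#include <vector>

struct ToyRange {
    std::string begin, end; // half-open [begin, end)
};

// Returns the pieces of `shard` that remain after removing `removed`,
// following the same two comparisons as removeRange above.
std::vector<ToyRange> remainders(const ToyRange& shard, const ToyRange& removed) {
    std::vector<ToyRange> out;
    if (shard.begin < removed.begin)
        out.push_back({ shard.begin, removed.begin }); // left remainder
    if (shard.end > removed.end)
        out.push_back({ removed.end, shard.end }); // right remainder
    return out;
}

int main() {
    // Removing ["c", "k") from a data shard covering ["a", "n") splits it in two,
    // which is exactly when count > 1 and the RangeSplit event fires.
    for (const auto& piece : remainders({ "a", "n" }, { "c", "k" })) {
        std::cout << "[" << piece.begin << ", " << piece.end << ") ";
    }
    std::cout << "\n"; // prints: [a, c) [k, n)
    return 0;
}
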
@@ -1986,28 +2029,37 @@
     }
 
     TraceEvent(SevVerbose, "ShardedRocksValidateShardManager", this->logId);
+    int totalDataShards = 0;
+    int expectedDataShards = 0;
     for (auto s = dataShardMap.ranges().begin(); s != dataShardMap.ranges().end(); ++s) {
         TraceEvent e(SevVerbose, "ShardedRocksValidateDataShardMap", this->logId);
         e.detail("Range", s->range());
         const DataShard* shard = s->value();
         e.detail("ShardAddress", reinterpret_cast<std::uintptr_t>(shard));
-        if (shard != nullptr) {
-            e.detail("PhysicalShard", shard->physicalShard->id);
-        } else {
-            e.detail("Shard", "Empty");
+        if (shard == nullptr) {
+            e.detail("RangeUnassigned", "True");
+            continue;
         }
-        if (shard != nullptr) {
-            if (shard->range != static_cast<KeyRangeRef>(s->range())) {
-                TraceEvent(SevWarnAlways, "ShardRangeMismatch").detail("Range", s->range());
-            }
-
-            ASSERT(shard->range == static_cast<KeyRangeRef>(s->range()));
-            ASSERT(shard->physicalShard != nullptr);
-            auto it = shard->physicalShard->dataShards.find(shard->range.begin.toString());
-            ASSERT(it != shard->physicalShard->dataShards.end());
-            ASSERT(it->second.get() == shard);
+        totalDataShards++;
+        if (shard->range != static_cast<KeyRangeRef>(s->range())) {
+            TraceEvent(SevWarnAlways, "ShardRangeMismatch")
+                .detail("Range", s->range())
+                .detail("DataShardRange", shard->range)
+                .detail("PhysicalShardId", shard->physicalShard->id);
         }
+
+        ASSERT(shard->range == static_cast<KeyRangeRef>(s->range()));
+        ASSERT(shard->physicalShard != nullptr);
+        auto it = shard->physicalShard->dataShards.find(shard->range.begin.toString());
+        ASSERT(it != shard->physicalShard->dataShards.end());
+        ASSERT(it->second.get() == shard);
+    }
+
+    for (auto [shardId, physicalShard] : physicalShards) {
+        ASSERT(physicalShard);
+        expectedDataShards += physicalShard->dataShards.size();
     }
+    ASSERT_EQ(expectedDataShards, totalDataShards);
 }
 
 private:
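The strengthened validate() adds a cross-check between the two bookkeeping structures: every assigned entry in the data shard map must be registered with exactly one physical shard, so the number of assigned map entries has to equal the sum of dataShards sizes across all physical shards (the ASSERT_EQ above). A minimal sketch of that invariant with toy containers in place of ShardManager's real state:

#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
    // Physical shard id -> begin keys of the data shards it owns.
    std::map<std::string, std::vector<std::string>> physicalShards = {
        { "shard-1", { "a", "g" } }, // two key ranges on one physical shard
        { "shard-2", { "x" } },
    };
    // Begin keys of the assigned entries in the data shard map.
    std::vector<std::string> assignedRanges = { "a", "g", "x" };

    size_t expectedDataShards = 0;
    for (const auto& entry : physicalShards) {
        expectedDataShards += entry.second.size();
    }

    // Mirrors ASSERT_EQ(expectedDataShards, totalDataShards) in validate().
    assert(expectedDataShards == assignedRanges.size());
    return 0;
}
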
@@ -4403,6 +4455,81 @@ TEST_CASE("noSim/ShardedRocksDB/Metadata") {
     return Void();
 }
 
+TEST_CASE("noSim/ShardedRocksDBRangeOps/RemoveSplitRange") {
+    state std::string rocksDBTestDir = "sharded-rocksdb-kvs-test-db";
+    platform::eraseDirectoryRecursive(rocksDBTestDir);
+
+    state ShardedRocksDBKeyValueStore* rocksdbStore =
+        new ShardedRocksDBKeyValueStore(rocksDBTestDir, deterministicRandom()->randomUniqueID());
+    state IKeyValueStore* kvStore = rocksdbStore;
+    wait(kvStore->init());
+
+    // Add two ranges to the same shard.
+    {
+        std::vector<Future<Void>> addRangeFutures;
+        addRangeFutures.push_back(kvStore->addRange(KeyRangeRef("a"_sr, "d"_sr), "shard-1"));
+        addRangeFutures.push_back(kvStore->addRange(KeyRangeRef("g"_sr, "n"_sr), "shard-1"));
+
+        wait(waitForAll(addRangeFutures));
+    }
+
+    state std::set<std::string> originalKeys = { "a", "b", "c", "g", "h", "m" };
+    state std::set<std::string> currentKeys = originalKeys;
+    for (auto key : originalKeys) {
+        kvStore->set(KeyValueRef(key, key));
+    }
+    wait(kvStore->commit());
+
+    state std::string key;
+    for (key : currentKeys) {
+        Optional<Value> val = wait(kvStore->readValue(key));
+        ASSERT(val.present());
+        ASSERT(val.get().toString() == key);
+    }
+
+    // Remove single range.
+    std::vector<std::string> shardsToCleanUp;
+    auto shardIds = kvStore->removeRange(KeyRangeRef("b"_sr, "c"_sr));
+    // Remove range didn't create empty shard.
+    ASSERT_EQ(shardIds.size(), 0);
+
+    currentKeys.erase("b");
+    for (key : originalKeys) {
+        Optional<Value> val = wait(kvStore->readValue(key));
+        if (currentKeys.contains(key)) {
+            ASSERT(val.present());
+            ASSERT(val.get().toString() == key);
+        } else {
+            ASSERT(!val.present());
+        }
+    }
+
+    // Remove range spanning on multple sub range.
+    auto shardIds = kvStore->removeRange(KeyRangeRef("c"_sr, "k"_sr));
+    ASSERT(shardIds.empty());
+
+    currentKeys.erase("c");
+    currentKeys.erase("g");
+    currentKeys.erase("h");
+    for (key : originalKeys) {
+        Optional<Value> val = wait(kvStore->readValue(key));
+        if (currentKeys.contains(key)) {
+            ASSERT(val.present());
+            ASSERT(val.get().toString() == key);
+        } else {
+            ASSERT(!val.present());
+        }
+    }
+
+    {
+        Future<Void> closed = kvStore->onClosed();
+        kvStore->dispose();
+        wait(closed);
+    }
+    ASSERT(!directoryExists(rocksDBTestDir));
+    return Void();
+}
+
 TEST_CASE("noSim/ShardedRocksDBCheckpoint/CheckpointBasic") {
     state std::string rocksDBTestDir = "sharded-rocks-checkpoint-restore";
     state std::map<Key, Value> kvs({ { "a"_sr, "TestValueA"_sr },
@@ -4777,7 +4904,7 @@ ACTOR Future<Void> testWrites(IKeyValueStore* kvStore, int writeCount) {
     state int i = 0;
 
     while (i < writeCount) {
-        state int endCount = deterministicRandom()->randomInt(i, i + 1000);
+        state int endCount = deterministicRandom()->randomInt(i + 1, i + 1000);
         state std::string beginKey = format("key-%6d", i);
         state std::string endKey = format("key-%6d", endCount);
         wait(kvStore->addRange(KeyRangeRef(beginKey, endKey), deterministicRandom()->randomUniqueID().toString()));
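
The one-character change above protects the write-stress helper from degenerate ranges: assuming deterministicRandom()->randomInt(a, b) samples the half-open interval [a, b) (an assumption about the Flow API, not stated in this diff), the old lower bound allowed endCount == i, which makes beginKey equal to endKey and hands addRange an empty range. A small sketch of the corrected bound, modeled with the standard <random> facilities:

#include <cassert>
#include <random>

// Toy stand-in for randomInt(lo, hi): uniform over the half-open interval [lo, hi).
int randomIntHalfOpen(std::mt19937& rng, int lo, int hi) {
    return std::uniform_int_distribution<int>(lo, hi - 1)(rng);
}

int main() {
    std::mt19937 rng(42);
    const int i = 0;
    for (int trial = 0; trial < 1000; ++trial) {
        int endCount = randomIntHalfOpen(rng, i + 1, i + 1000); // was (i, i + 1000)
        assert(endCount > i); // [key-i, key-endCount) is never empty now
    }
    return 0;
}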

fdbserver/storageserver.actor.cpp

Lines changed: 5 additions & 3 deletions

@@ -11146,7 +11146,8 @@ void changeServerKeysWithPhysicalShards(StorageServer* data,
             .detail("Range", keys)
             .detail("NowAssigned", nowAssigned)
             .detail("Version", cVer)
-            .detail("ResultingShard", newShard.toString());
+            .detail("ResultingShard", newShard.toString())
+            .detail("ShardIdDifferent", newShard.id != newShard.desiredId);
     } else if (currentShard->adding) {
         if (nowAssigned) {
             TraceEvent(sev, "PhysicalShardStateError")
@@ -11246,9 +11247,10 @@
             .detail("NewShard", updatedShards.back().toString());
     } else if (!dataAvailable) {
         if (version == data->initialClusterVersion - 1) {
-            TraceEvent(sevDm, "CSKWithPhysicalShardsSeedRange", data->thisServerID)
+            TraceEvent(SevInfo, "CSKWithPhysicalShardsSeedRange", data->thisServerID)
                 .detail("ShardID", desiredId)
-                .detail("Range", range);
+                .detail("Range", range)
+                .detail("Version", cVer);
             changeNewestAvailable.emplace_back(range, latestVersion);
             updatedShards.push_back(
                 StorageServerShard(range, version, desiredId, desiredId, StorageServerShard::ReadWrite));

tests/noSim/ShardedRocksDBTest.toml

Lines changed: 3 additions & 1 deletion

@@ -2,6 +2,8 @@
 rocksdb_disable_auto_compactions = true
 rocksdb_suggest_compact_clear_range = false
 rocksdb_empty_range_check = false
+sharded_rocksdb_validate_mapping_ratio=1.0
+sharded_rocksdb_allow_multiple_ranges=true
 
 [[test]]
 testTitle = 'UnitTests'
@@ -11,4 +13,4 @@ rocksdb_empty_range_check = false
 [[test.workload]]
 testName = 'UnitTests'
 maxTestCases = 10
-testsMatching = 'noSim/ShardedRocksDB/'
+testsMatching = 'noSim/ShardedRocksDB/'
