Skip to content

Commit da665bd

Browse files
Support multiple range
1 parent c289917 commit da665bd

File tree

4 files changed

+145
-24
lines changed

4 files changed

+145
-24
lines changed

fdbclient/ServerKnobs.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
623623
// Block cache key-value checksum. Checksum is validated during read, so has non-trivial impact on read performance.
624624
init( ROCKSDB_BLOCK_PROTECTION_BYTES_PER_KEY, 0 ); if ( randomize && BUGGIFY ) ROCKSDB_BLOCK_PROTECTION_BYTES_PER_KEY = 8; // Default: 0 (disabled). Supported values: 0, 1, 2, 4, 8.
625625
init( ROCKSDB_ENABLE_NONDETERMINISM, false );
626+
init( SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES, false );
626627
init( SHARDED_ROCKSDB_ALLOW_WRITE_STALL_ON_FLUSH, false );
627628
init( SHARDED_ROCKSDB_VALIDATE_MAPPING_RATIO, 0.01 ); if (isSimulated) SHARDED_ROCKSDB_VALIDATE_MAPPING_RATIO = deterministicRandom()->random01();
628629
init( SHARD_METADATA_SCAN_BYTES_LIMIT, 10485760 ); // 10MB

fdbclient/include/fdbclient/ServerKnobs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,7 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ServerKnobs : public KnobsImpl<ServerKno
607607
// Note that turning this on in simulation could lead to non-deterministic runs
608608
// since we rely on rocksdb metadata. This knob also applies to sharded rocks
609609
// storage engine.
610+
bool SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES;
610611
bool SHARDED_ROCKSDB_ALLOW_WRITE_STALL_ON_FLUSH;
611612
int SHARDED_ROCKSDB_MEMTABLE_MAX_RANGE_DELETIONS;
612613
double SHARDED_ROCKSDB_VALIDATE_MAPPING_RATIO;

fdbserver/KeyValueStoreShardedRocksDB.actor.cpp

Lines changed: 139 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1546,7 +1546,7 @@ class ShardManager {
15461546
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
15471547
if (it.value()) {
15481548
if (it.value()->physicalShard->id == id) {
1549-
TraceEvent(SevError, "ShardedRocksDBAddRange")
1549+
TraceEvent(SevWarn, "ShardedRocksDBAddRange")
15501550
.detail("ErrorType", "RangeAlreadyExist")
15511551
.detail("IntersectingRange", it->range())
15521552
.detail("DataShardRange", it->value()->range)
@@ -1564,15 +1564,34 @@ class ShardManager {
15641564
}
15651565
}
15661566

1567-
auto currentCfOptions = active ? rState->getCFOptions() : rState->getCFOptionsForInactiveShard();
1568-
auto [it, inserted] = physicalShards.emplace(id, std::make_shared<PhysicalShard>(db, id, currentCfOptions));
1569-
std::shared_ptr<PhysicalShard>& shard = it->second;
1567+
auto it = physicalShards.find(id);
1568+
std::shared_ptr<PhysicalShard> physicalShard = nullptr;
1569+
if (it != physicalShards.end()) {
1570+
physicalShard = it->second;
1571+
if (SERVER_KNOBS->SHARDED_ROCKSDB_ALLOW_MULTIPLE_RANGES) {
1572+
bool continous = false;
1573+
for (auto&[_, shard]: physicalShard->dataShards) {
1574+
if (shard->range.begin < range.begin && shard->range.end == range.begin) {
1575+
continous = true;
1576+
break;
1577+
}
1578+
if (shard->range.begin > range.begin && range.end == shard->range.begin) {
1579+
continous = true;
1580+
break;
1581+
}
1582+
}
1583+
ASSERT_WE_THINK(continous);
1584+
}
1585+
} else {
1586+
auto currentCfOptions = active ? rState->getCFOptions() : rState->getCFOptionsForInactiveShard();
1587+
auto [it, inserted] = physicalShards.emplace(id, std::make_shared<PhysicalShard>(db, id, currentCfOptions));
1588+
physicalShard = it->second;
1589+
}
15701590

15711591
activePhysicalShardIds.emplace(id);
1572-
1573-
auto dataShard = std::make_unique<DataShard>(range, shard.get());
1592+
auto dataShard = std::make_unique<DataShard>(range, physicalShard.get());
15741593
dataShardMap.insert(range, dataShard.get());
1575-
shard->dataShards[range.begin.toString()] = std::move(dataShard);
1594+
physicalShard->dataShards[range.begin.toString()] = std::move(dataShard);
15761595

15771596
validate();
15781597

@@ -1581,7 +1600,7 @@ class ShardManager {
15811600
.detail("ShardId", id)
15821601
.detail("Active", active);
15831602

1584-
return shard.get();
1603+
return physicalShard.get();
15851604
}
15861605

15871606
std::vector<std::string> removeRange(KeyRange range) {
@@ -1636,6 +1655,7 @@ class ShardManager {
16361655

16371656
// Range modification could result in more than one segments. Remove the original segment key here.
16381657
existingShard->dataShards.erase(shardRange.begin.toString());
1658+
int count = 0;
16391659
if (shardRange.begin < range.begin) {
16401660
auto dataShard =
16411661
std::make_unique<DataShard>(KeyRange(KeyRangeRef(shardRange.begin, range.begin)), existingShard);
@@ -1646,6 +1666,7 @@ class ShardManager {
16461666

16471667
existingShard->dataShards[shardRange.begin.toString()] = std::move(dataShard);
16481668
logShardEvent(existingShard->id, shardRange, ShardOp::MODIFY_RANGE, SevInfo, msg);
1669+
count++;
16491670
}
16501671

16511672
if (shardRange.end > range.end) {
@@ -1658,6 +1679,17 @@ class ShardManager {
16581679

16591680
existingShard->dataShards[range.end.toString()] = std::move(dataShard);
16601681
logShardEvent(existingShard->id, shardRange, ShardOp::MODIFY_RANGE, SevInfo, msg);
1682+
count++;
1683+
}
1684+
1685+
if (count > 1) {
1686+
// During shard split, a shard could be split into multiple key ranges. One of the key ranges will
1687+
// remain on the storage server, other key ranges will be moved to new server. Depending on the order of
1688+
// executing the split data moves, a shard could be broken into multiple pieces. Eventually a single
1689+
// continuous key range will remain on the physical shard. Data consistency is guaranteed.
1690+
//
1691+
// For team based shard placement, we expect multiple data shards to be located on the same physical shard.
1692+
TraceEvent("RangeSplit").detail("OriginalRange", shardRange).detail("RemovedRange", range);
16611693
}
16621694
}
16631695

@@ -1986,28 +2018,37 @@ class ShardManager {
19862018
}
19872019

19882020
TraceEvent(SevVerbose, "ShardedRocksValidateShardManager", this->logId);
2021+
int totalDataShards = 0;
2022+
int expectedDataShards = 0;
19892023
for (auto s = dataShardMap.ranges().begin(); s != dataShardMap.ranges().end(); ++s) {
19902024
TraceEvent e(SevVerbose, "ShardedRocksValidateDataShardMap", this->logId);
19912025
e.detail("Range", s->range());
19922026
const DataShard* shard = s->value();
19932027
e.detail("ShardAddress", reinterpret_cast<std::uintptr_t>(shard));
1994-
if (shard != nullptr) {
1995-
e.detail("PhysicalShard", shard->physicalShard->id);
1996-
} else {
1997-
e.detail("Shard", "Empty");
2028+
if (shard == nullptr) {
2029+
e.detail("RangeUnassigned", "True");
2030+
continue;
19982031
}
1999-
if (shard != nullptr) {
2000-
if (shard->range != static_cast<KeyRangeRef>(s->range())) {
2001-
TraceEvent(SevWarnAlways, "ShardRangeMismatch").detail("Range", s->range());
2002-
}
2003-
2004-
ASSERT(shard->range == static_cast<KeyRangeRef>(s->range()));
2005-
ASSERT(shard->physicalShard != nullptr);
2006-
auto it = shard->physicalShard->dataShards.find(shard->range.begin.toString());
2007-
ASSERT(it != shard->physicalShard->dataShards.end());
2008-
ASSERT(it->second.get() == shard);
2032+
totalDataShards++;
2033+
if (shard->range != static_cast<KeyRangeRef>(s->range())) {
2034+
TraceEvent(SevWarnAlways, "ShardRangeMismatch")
2035+
.detail("Range", s->range())
2036+
.detail("DataShardRange", shard->range)
2037+
.detail("PhysicalShardId", shard->physicalShard->id);
20092038
}
2039+
2040+
ASSERT(shard->range == static_cast<KeyRangeRef>(s->range()));
2041+
ASSERT(shard->physicalShard != nullptr);
2042+
auto it = shard->physicalShard->dataShards.find(shard->range.begin.toString());
2043+
ASSERT(it != shard->physicalShard->dataShards.end());
2044+
ASSERT(it->second.get() == shard);
20102045
}
2046+
2047+
for (auto [shardId, physicalShard] : physicalShards) {
2048+
ASSERT(physicalShard);
2049+
expectedDataShards += physicalShard->dataShards.size();
2050+
}
2051+
ASSERT_EQ(expectedDataShards, totalDataShards);
20112052
}
20122053

20132054
private:
@@ -4403,6 +4444,81 @@ TEST_CASE("noSim/ShardedRocksDB/Metadata") {
44034444
return Void();
44044445
}
44054446

4447+
TEST_CASE("noSim/ShardedRocksDBRangeOps/RemoveSplitRange") {
4448+
state std::string rocksDBTestDir = "sharded-rocksdb-kvs-test-db";
4449+
platform::eraseDirectoryRecursive(rocksDBTestDir);
4450+
4451+
state ShardedRocksDBKeyValueStore* rocksdbStore =
4452+
new ShardedRocksDBKeyValueStore(rocksDBTestDir, deterministicRandom()->randomUniqueID());
4453+
state IKeyValueStore* kvStore = rocksdbStore;
4454+
wait(kvStore->init());
4455+
4456+
// Add two ranges to the same shard.
4457+
{
4458+
std::vector<Future<Void>> addRangeFutures;
4459+
addRangeFutures.push_back(kvStore->addRange(KeyRangeRef("a"_sr, "d"_sr), "shard-1"));
4460+
addRangeFutures.push_back(kvStore->addRange(KeyRangeRef("g"_sr, "n"_sr), "shard-1"));
4461+
4462+
wait(waitForAll(addRangeFutures));
4463+
}
4464+
4465+
state std::set<std::string> originalKeys = { "a", "b", "c", "g", "h", "m" };
4466+
state std::set<std::string> currentKeys = originalKeys;
4467+
for (auto key : originalKeys) {
4468+
kvStore->set(KeyValueRef(key, key));
4469+
}
4470+
wait(kvStore->commit());
4471+
4472+
state std::string key;
4473+
for (key : currentKeys) {
4474+
Optional<Value> val = wait(kvStore->readValue(key));
4475+
ASSERT(val.present());
4476+
ASSERT(val.get().toString() == key);
4477+
}
4478+
4479+
// Remove single range.
4480+
std::vector<std::string> shardsToCleanUp;
4481+
auto shardIds = kvStore->removeRange(KeyRangeRef("b"_sr, "c"_sr));
4482+
// Removing the range didn't create an empty shard.
4483+
ASSERT_EQ(shardIds.size(), 0);
4484+
4485+
currentKeys.erase("b");
4486+
for (key : originalKeys) {
4487+
Optional<Value> val = wait(kvStore->readValue(key));
4488+
if (currentKeys.contains(key)) {
4489+
ASSERT(val.present());
4490+
ASSERT(val.get().toString() == key);
4491+
} else {
4492+
ASSERT(!val.present());
4493+
}
4494+
}
4495+
4496+
// Remove a range spanning multiple sub-ranges.
4497+
auto shardIds = kvStore->removeRange(KeyRangeRef("c"_sr, "k"_sr));
4498+
ASSERT(shardIds.empty());
4499+
4500+
currentKeys.erase("c");
4501+
currentKeys.erase("g");
4502+
currentKeys.erase("h");
4503+
for (key : originalKeys) {
4504+
Optional<Value> val = wait(kvStore->readValue(key));
4505+
if (currentKeys.contains(key)) {
4506+
ASSERT(val.present());
4507+
ASSERT(val.get().toString() == key);
4508+
} else {
4509+
ASSERT(!val.present());
4510+
}
4511+
}
4512+
4513+
{
4514+
Future<Void> closed = kvStore->onClosed();
4515+
kvStore->dispose();
4516+
wait(closed);
4517+
}
4518+
ASSERT(!directoryExists(rocksDBTestDir));
4519+
return Void();
4520+
}
4521+
44064522
TEST_CASE("noSim/ShardedRocksDBCheckpoint/CheckpointBasic") {
44074523
state std::string rocksDBTestDir = "sharded-rocks-checkpoint-restore";
44084524
state std::map<Key, Value> kvs({ { "a"_sr, "TestValueA"_sr },
@@ -4777,7 +4893,7 @@ ACTOR Future<Void> testWrites(IKeyValueStore* kvStore, int writeCount) {
47774893
state int i = 0;
47784894

47794895
while (i < writeCount) {
4780-
state int endCount = deterministicRandom()->randomInt(i, i + 1000);
4896+
state int endCount = deterministicRandom()->randomInt(i+1, i + 1000);
47814897
state std::string beginKey = format("key-%6d", i);
47824898
state std::string endKey = format("key-%6d", endCount);
47834899
wait(kvStore->addRange(KeyRangeRef(beginKey, endKey), deterministicRandom()->randomUniqueID().toString()));

tests/noSim/ShardedRocksDBTest.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
rocksdb_disable_auto_compactions = true
33
rocksdb_suggest_compact_clear_range = false
44
rocksdb_empty_range_check = false
5+
sharded_rocksdb_validate_mapping_ratio=1.0
6+
sharded_rocksdb_allow_multiple_ranges=true
57

68
[[test]]
79
testTitle = 'UnitTests'
@@ -11,4 +13,5 @@ rocksdb_empty_range_check = false
1113
[[test.workload]]
1214
testName = 'UnitTests'
1315
maxTestCases = 10
14-
testsMatching = 'noSim/ShardedRocksDB/'
16+
# testsMatching = 'noSim/ShardedRocksDB/'
17+
testsMatching = 'noSim/ShardedRocksDBRangeOps/RemoveSplitRange'

0 commit comments

Comments
 (0)