Skip to content

Commit ddd724e

Browse files
committed
new seqno assignment
1 parent f6b2cdd commit ddd724e

File tree

6 files changed

+263
-161
lines changed

6 files changed

+263
-161
lines changed

include/rocksdb/utilities/write_batch_with_index.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ class WBWIIterator {
7777

7878
virtual void Prev() = 0;
7979

80+
virtual Status status() const = 0;
81+
8082
// The returned WriteEntry is only valid until the next mutation of
8183
// WriteBatchWithIndex.
8284
virtual WriteEntry Entry() const = 0;
@@ -85,7 +87,9 @@ class WBWIIterator {
8587
// and it was overwritten by another update.
8688
virtual bool HasOverWrittenSingleDel() const { return false; }
8789

88-
virtual Status status() const = 0;
90+
// The number of times an update has been issued for the current user key in this
91+
// write batch up to this entry.
92+
virtual uint32_t GetUpdateCount() const { return 0; }
8993
};
9094

9195
// A WriteBatchWithIndex with a binary searchable index built for all the keys

memtable/wbwi_memtable.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ bool WBWIMemTable::Get(const LookupKey& key, std::string* value,
6262

6363
[[maybe_unused]] SequenceNumber read_seq =
6464
GetInternalKeySeqno(key.internal_key());
65+
// This memtable is a single write batch, so no snapshot can be taken within
66+
// assigned seqnos for this memtable.
67+
assert(read_seq >= assigned_seqno_.upper_bound ||
68+
read_seq < assigned_seqno_.lower_bound);
6569
std::unique_ptr<InternalIterator> iter{NewIterator()};
6670
iter->Seek(key.internal_key());
6771
const Slice lookup_user_key = key.user_key();

memtable/wbwi_memtable.h

Lines changed: 47 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -13,21 +13,27 @@ namespace ROCKSDB_NAMESPACE {
1313
// a transaction (which is based on WBWI) into the DB as an immutable memtable.
1414
//
1515
// REQUIRE overwrite_key to be true for the WBWI
16-
// Since the keys in WBWI do not have sequence number, the memtable needs to be
16+
// Since the keys in WBWI do not have sequence numbers, this class is responsible
17+
// for assigning sequence numbers to the keys. This memtable needs to be
1718
// assigned a range of sequence numbers through AssignSequenceNumbers(seqno)
1819
// before being available for reads.
19-
// With overwrite_key = true, WBWI keeps track of the most recent update for
20-
// each key, and each such key will be assigned seqno.upper_bound during reads.
21-
// One exception is during flush, consider the following scenario:
20+
//
21+
// The sequence number assignment uses the update count for each key
22+
// tracked in WBWI (see WBWIIterator::GetUpdateCount()). For each key, the
23+
// sequence number assigned is seqno.lower_bound + update_count - 1. So more
24+
// recent updates will have a higher sequence number.
25+
//
26+
// There is a special case where this memtable needs to emit an extra
27+
// SingleDelete even when the SD is overwritten by another update.
28+
// Consider the following scenario:
2229
// - WBWI has SD(k) then PUT(k, v1)
2330
// - DB has PUT(k, v2) in L1
2431
// - flush WBWI adds PUT(k, v1) into L0
2532
// - live memtable contains SD(k)
2633
// - flush live memtable and compact it with L0 will drop SD(k) and PUT(k, v1)
2734
// - the PUT(k, v2) in L1 incorrectly becomes visible
2835
// So during flush, the iterator from this memtable will need to emit the overwritten
29-
// single deletion. These single deletion entries will be
30-
// assigned seqno.upper_bound - 1.
36+
// single deletion. This SD will be assigned seqno.lower_bound.
3137
class WBWIMemTable final : public ReadOnlyMemTable {
3238
public:
3339
struct SeqnoRange {
@@ -258,51 +264,40 @@ class WBWIMemTableIterator final : public InternalIterator {
258264
}
259265

260266
void Seek(const Slice& target) override {
267+
// `emit_overwritten_single_del_` is only used for flush, which does
268+
// sequential forward scan from the beginning.
269+
assert(!emit_overwritten_single_del_);
261270
Slice target_user_key = ExtractUserKey(target);
271+
// Moves to first update >= target_user_key
262272
it_->Seek(target_user_key);
263-
if (it_->Valid()) {
264-
// compare seqno
265-
SequenceNumber seqno = GetInternalKeySeqno(target);
266-
assert(!emit_overwritten_single_del_);
267-
// For now all keys are assigned seqno_ub_, this may change after merge
268-
// is supported.
269-
assert(seqno <= assigned_seqno_.lower_bound ||
270-
seqno >= assigned_seqno_.upper_bound);
271-
if (seqno < assigned_seqno_.upper_bound &&
272-
comparator_->Compare(it_->Entry().key, target_user_key) == 0) {
273-
it_->Next();
274-
// TODO: cannot assume distinct keys once Merge is supported
275-
if (it_->Valid()) {
276-
assert(comparator_->Compare(it_->Entry().key, target_user_key) > 0);
277-
}
278-
}
273+
SequenceNumber target_seqno = GetInternalKeySeqno(target);
274+
// Compare seqno
275+
while (it_->Valid() &&
276+
comparator_->Compare(it_->Entry().key, target_user_key) == 0 &&
277+
target_seqno < CurrentKeySeqno()) {
278+
it_->Next();
279279
}
280280
UpdateKey();
281281
}
282282

283283
void SeekForPrev(const Slice& target) override {
284+
assert(!emit_overwritten_single_del_);
284285
Slice target_user_key = ExtractUserKey(target);
286+
// Moves to last update <= target_user_key
285287
it_->SeekForPrev(target_user_key);
286-
if (it_->Valid()) {
287-
SequenceNumber seqno = GetInternalKeySeqno(target);
288-
assert(seqno <= assigned_seqno_.lower_bound ||
289-
seqno >= assigned_seqno_.upper_bound);
290-
if (seqno > assigned_seqno_.upper_bound &&
291-
comparator_->Compare(it_->Entry().key, target_user_key) == 0) {
292-
it_->Prev();
293-
if (it_->Valid()) {
294-
// TODO: cannot assume distinct keys once Merge is supported
295-
assert(comparator_->Compare(it_->Entry().key, target_user_key) < 0);
296-
}
297-
}
288+
SequenceNumber target_seqno = GetInternalKeySeqno(target);
289+
// Need to move to the first entry with seqno >= target_seqno for the same
290+
// user key or a different user key.
291+
// Compare seqno
292+
while (it_->Valid() &&
293+
comparator_->Compare(it_->Entry().key, target_user_key) == 0 &&
294+
CurrentKeySeqno() < target_seqno) {
295+
it_->Prev();
298296
}
299297
UpdateKey();
300298
}
301299

302300
void Next() override {
303-
// Only need to emit single deletion during flush. Since Flush does
304-
// sequential forward scan, we only need to emit single deletion in Next(),
305-
// and do not need to consider iterator direction change.
306301
assert(Valid());
307302
if (emit_overwritten_single_del_) {
308303
if (it_->HasOverWrittenSingleDel() && !at_overwritten_single_del_) {
@@ -329,6 +324,7 @@ class WBWIMemTableIterator final : public InternalIterator {
329324
}
330325

331326
void Prev() override {
327+
assert(!emit_overwritten_single_del_);
332328
assert(Valid());
333329
it_->Prev();
334330
UpdateKey();
@@ -341,7 +337,6 @@ class WBWIMemTableIterator final : public InternalIterator {
341337

342338
Slice value() const override {
343339
assert(Valid());
344-
// TODO: it_->Entry() is not trivial, cache it
345340
return it_->Entry().value;
346341
}
347342

@@ -355,6 +350,16 @@ class WBWIMemTableIterator final : public InternalIterator {
355350
private:
356351
static const std::unordered_map<WriteType, ValueType> WriteTypeToValueTypeMap;
357352

353+
SequenceNumber CurrentKeySeqno() {
354+
assert(it_->Valid());
355+
assert(it_->GetUpdateCount() >= 1);
356+
auto seq = assigned_seqno_.lower_bound + it_->GetUpdateCount() - 1;
357+
assert(seq <= assigned_seqno_.upper_bound);
358+
return seq;
359+
}
360+
361+
// If it_ is valid, update key_ to an internal key containing it_'s current
362+
// key, CurrentKeySeqno() and a type corresponding to it_'s current entry type.
358363
void UpdateKey() {
359364
valid_ = it_->Valid();
360365
if (!Valid()) {
@@ -370,16 +375,16 @@ class WBWIMemTableIterator final : public InternalIterator {
370375
std::to_string(it_->Entry().type));
371376
return;
372377
}
373-
key_buf_.SetInternalKey(it_->Entry().key, assigned_seqno_.upper_bound,
374-
t->second);
378+
key_buf_.SetInternalKey(it_->Entry().key, CurrentKeySeqno(), t->second);
375379
key_ = key_buf_.GetInternalKey();
376380
}
377381

378382
void UpdateSingleDeleteKey() {
379383
assert(it_->Valid());
380384
assert(Valid());
381-
assert(assigned_seqno_.lower_bound < assigned_seqno_.upper_bound);
382-
key_buf_.SetInternalKey(it_->Entry().key, assigned_seqno_.upper_bound - 1,
385+
// The key that overwrites this SingleDelete will be assigned at least
386+
// seqno lower_bound + 1 (see CurrentKeySeqno()).
387+
key_buf_.SetInternalKey(it_->Entry().key, assigned_seqno_.lower_bound,
383388
kTypeSingleDeletion);
384389
key_ = key_buf_.GetInternalKey();
385390
at_overwritten_single_del_ = true;

0 commit comments

Comments
 (0)