@@ -13,21 +13,27 @@ namespace ROCKSDB_NAMESPACE {
13
13
// a transaction (which is based on WBWI) into the DB as an immutable memtable.
14
14
//
15
15
// REQUIRE overwrite_key to be true for the WBWI
16
- // Since the keys in WBWI do not have sequence number, the memtable needs to be
16
+ // Since the keys in WBWI do not have sequence number, this class is responsible
17
+ // for assigning sequence numbers to the keys. This memtable needs to be
17
18
// assigned a range of sequence numbers through AssignSequenceNumbers(seqno)
18
19
// before being available for reads.
19
- // With overwrite_key = true, WBWI keeps track of the most recent update for
20
- // each key, and each such key will be assigned seqno.upper_bound during reads.
21
- // One exception is during flush, consider the following scenario:
20
+ //
21
+ // The sequence number assignment uses the update count for each key
22
+ // tracked in WBWI (see WBWIIterator::GetUpdateCount()). For each key, the
23
+ // sequence number assigned is seqno.lower_bound + update_count - 1. So more
24
+ // recent updates will have higher sequence numbers.
25
+ //
26
+ // There is a special case where this memtable needs to emit an extra
27
+ // SingleDelete even when the SD is overwritten by another update.
28
+ // Consider the following scenario:
22
29
// - WBWI has SD(k) then PUT(k, v1)
23
30
// - DB has PUT(k, v2) in L1
24
31
// - flush WBWI adds PUT(k, v1) into L0
25
32
// - live memtable contains SD(k)
26
33
// - flush live memtable and compact it with L0 will drop SD(k) and PUT(k, v1)
27
34
// - the PUT(k, v2) in L1 incorrectly becomes visible
28
35
// So during flush, iterator from this memtable will need to emit overwritten
29
- // single deletion. These single deletion entries will be
30
- // assigned seqno.upper_bound - 1.
36
+ // single deletion. This SD will be assigned seqno.lower_bound.
31
37
class WBWIMemTable final : public ReadOnlyMemTable {
32
38
public:
33
39
struct SeqnoRange {
@@ -258,51 +264,40 @@ class WBWIMemTableIterator final : public InternalIterator {
258
264
}
259
265
260
266
void Seek (const Slice& target) override {
267
+ // `emit_overwritten_single_del_` is only used for flush, which does
268
+ // sequential forward scan from the beginning.
269
+ assert (!emit_overwritten_single_del_);
261
270
Slice target_user_key = ExtractUserKey (target);
271
+ // Moves to first update >= target_user_key
262
272
it_->Seek (target_user_key);
263
- if (it_->Valid ()) {
264
- // compare seqno
265
- SequenceNumber seqno = GetInternalKeySeqno (target);
266
- assert (!emit_overwritten_single_del_);
267
- // For now all keys are assigned seqno_ub_, this may change after merge
268
- // is supported.
269
- assert (seqno <= assigned_seqno_.lower_bound ||
270
- seqno >= assigned_seqno_.upper_bound );
271
- if (seqno < assigned_seqno_.upper_bound &&
272
- comparator_->Compare (it_->Entry ().key , target_user_key) == 0 ) {
273
- it_->Next ();
274
- // TODO: cannot assume distinct keys once Merge is supported
275
- if (it_->Valid ()) {
276
- assert (comparator_->Compare (it_->Entry ().key , target_user_key) > 0 );
277
- }
278
- }
273
+ SequenceNumber target_seqno = GetInternalKeySeqno (target);
274
+ // Compare seqno
275
+ while (it_->Valid () &&
276
+ comparator_->Compare (it_->Entry ().key , target_user_key) == 0 &&
277
+ target_seqno < CurrentKeySeqno ()) {
278
+ it_->Next ();
279
279
}
280
280
UpdateKey ();
281
281
}
282
282
283
283
void SeekForPrev (const Slice& target) override {
284
+ assert (!emit_overwritten_single_del_);
284
285
Slice target_user_key = ExtractUserKey (target);
286
+ // Moves to last update <= target_user_key
285
287
it_->SeekForPrev (target_user_key);
286
- if (it_->Valid ()) {
287
- SequenceNumber seqno = GetInternalKeySeqno (target);
288
- assert (seqno <= assigned_seqno_.lower_bound ||
289
- seqno >= assigned_seqno_.upper_bound );
290
- if (seqno > assigned_seqno_.upper_bound &&
291
- comparator_->Compare (it_->Entry ().key , target_user_key) == 0 ) {
292
- it_->Prev ();
293
- if (it_->Valid ()) {
294
- // TODO: cannot assume distinct keys once Merge is supported
295
- assert (comparator_->Compare (it_->Entry ().key , target_user_key) < 0 );
296
- }
297
- }
288
+ SequenceNumber target_seqno = GetInternalKeySeqno (target);
289
+ // Need to move to the first entry with seqno >= target_seqno for the same
290
+ // user key or a different user key.
291
+ // Compare seqno
292
+ while (it_->Valid () &&
293
+ comparator_->Compare (it_->Entry ().key , target_user_key) == 0 &&
294
+ CurrentKeySeqno () < target_seqno) {
295
+ it_->Prev ();
298
296
}
299
297
UpdateKey ();
300
298
}
301
299
302
300
void Next () override {
303
- // Only need to emit single deletion during flush. Since Flush does
304
- // sequential forward scan, we only need to emit single deletion in Next(),
305
- // and do not need to consider iterator direction change.
306
301
assert (Valid ());
307
302
if (emit_overwritten_single_del_) {
308
303
if (it_->HasOverWrittenSingleDel () && !at_overwritten_single_del_) {
@@ -329,6 +324,7 @@ class WBWIMemTableIterator final : public InternalIterator {
329
324
}
330
325
331
326
void Prev () override {
327
+ assert (!emit_overwritten_single_del_);
332
328
assert (Valid ());
333
329
it_->Prev ();
334
330
UpdateKey ();
@@ -341,7 +337,6 @@ class WBWIMemTableIterator final : public InternalIterator {
341
337
342
338
Slice value () const override {
343
339
assert (Valid ());
344
- // TODO: it_->Entry() is not trivial, cache it
345
340
return it_->Entry ().value ;
346
341
}
347
342
@@ -355,6 +350,16 @@ class WBWIMemTableIterator final : public InternalIterator {
355
350
private:
356
351
static const std::unordered_map<WriteType, ValueType> WriteTypeToValueTypeMap;
357
352
353
+ SequenceNumber CurrentKeySeqno () {
354
+ assert (it_->Valid ());
355
+ assert (it_->GetUpdateCount () >= 1 );
356
+ auto seq = assigned_seqno_.lower_bound + it_->GetUpdateCount () - 1 ;
357
+ assert (seq <= assigned_seqno_.upper_bound );
358
+ return seq;
359
+ }
360
+
361
+ // If it_ is valid, update key_ to an internal key containing it_'s current
362
+ // key, CurrentKeySeqno() and a type corresponding to it_'s current entry type.
358
363
void UpdateKey () {
359
364
valid_ = it_->Valid ();
360
365
if (!Valid ()) {
@@ -370,16 +375,16 @@ class WBWIMemTableIterator final : public InternalIterator {
370
375
std::to_string (it_->Entry ().type ));
371
376
return ;
372
377
}
373
- key_buf_.SetInternalKey (it_->Entry ().key , assigned_seqno_.upper_bound ,
374
- t->second );
378
+ key_buf_.SetInternalKey (it_->Entry ().key , CurrentKeySeqno (), t->second );
375
379
key_ = key_buf_.GetInternalKey ();
376
380
}
377
381
378
382
void UpdateSingleDeleteKey () {
379
383
assert (it_->Valid ());
380
384
assert (Valid ());
381
- assert (assigned_seqno_.lower_bound < assigned_seqno_.upper_bound );
382
- key_buf_.SetInternalKey (it_->Entry ().key , assigned_seqno_.upper_bound - 1 ,
385
+ // The key that overwrites this SingleDelete will be assigned at least
386
+ // seqno lower_bound + 1 (see CurrentKeySeqno()).
387
+ key_buf_.SetInternalKey (it_->Entry ().key , assigned_seqno_.lower_bound ,
383
388
kTypeSingleDeletion );
384
389
key_ = key_buf_.GetInternalKey ();
385
390
at_overwritten_single_del_ = true ;
0 commit comments