Skip to content

Commit 63c92d0

Browse files
committed
Use immutable binary tree for index
Signed-off-by: Marek Siarkowicz <[email protected]>
1 parent e26043f commit 63c92d0

File tree

4 files changed

+574
-608
lines changed

4 files changed

+574
-608
lines changed

server/storage/mvcc/index.go

+105-136
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
package mvcc
1616

1717
import (
18-
"sync"
19-
18+
"bytes"
19+
"fmt"
2020
"github.com/google/btree"
2121
"go.uber.org/zap"
2222
)
@@ -37,96 +37,110 @@ type index interface {
3737
}
3838

3939
type treeIndex struct {
40-
sync.RWMutex
41-
tree *btree.BTreeG[*keyIndex]
42-
lg *zap.Logger
40+
baseRev int64
41+
revisionTree []*btree.BTreeG[keyRev]
42+
lg *zap.Logger
4343
}
4444

45-
func newTreeIndex(lg *zap.Logger) index {
46-
return &treeIndex{
47-
tree: btree.NewG(32, func(aki *keyIndex, bki *keyIndex) bool {
48-
return aki.Less(bki)
49-
}),
50-
lg: lg,
51-
}
45+
func (ti *treeIndex) Equal(b index) bool {
46+
//TODO implement me
47+
panic("implement me")
5248
}
5349

54-
func (ti *treeIndex) Put(key []byte, rev Revision) {
55-
keyi := &keyIndex{key: key}
56-
57-
ti.Lock()
58-
defer ti.Unlock()
59-
okeyi, ok := ti.tree.Get(keyi)
60-
if !ok {
61-
keyi.put(ti.lg, rev.Main, rev.Sub)
62-
ti.tree.ReplaceOrInsert(keyi)
63-
return
64-
}
65-
okeyi.put(ti.lg, rev.Main, rev.Sub)
50+
func (ti *treeIndex) Insert(ki *keyIndex) {
51+
//TODO implement me
52+
panic("implement me")
6653
}
6754

68-
func (ti *treeIndex) Get(key []byte, atRev int64) (modified, created Revision, ver int64, err error) {
69-
ti.RLock()
70-
defer ti.RUnlock()
71-
return ti.unsafeGet(key, atRev)
55+
func (ti *treeIndex) KeyIndex(ki *keyIndex) *keyIndex {
56+
//TODO implement me
57+
panic("implement me")
7258
}
7359

74-
func (ti *treeIndex) unsafeGet(key []byte, atRev int64) (modified, created Revision, ver int64, err error) {
75-
keyi := &keyIndex{key: key}
76-
if keyi = ti.keyIndex(keyi); keyi == nil {
77-
return Revision{}, Revision{}, 0, ErrRevisionNotFound
78-
}
79-
return keyi.get(ti.lg, atRev)
60+
type keyRev struct {
61+
key []byte
62+
mod, created Revision
63+
version int64
8064
}
8165

82-
func (ti *treeIndex) KeyIndex(keyi *keyIndex) *keyIndex {
83-
ti.RLock()
84-
defer ti.RUnlock()
85-
return ti.keyIndex(keyi)
66+
var lessThen btree.LessFunc[keyRev] = func(k keyRev, k2 keyRev) bool {
67+
return compare(k, k2) == -1
8668
}
8769

88-
func (ti *treeIndex) keyIndex(keyi *keyIndex) *keyIndex {
89-
if ki, ok := ti.tree.Get(keyi); ok {
90-
return ki
91-
}
92-
return nil
70+
func compare(k keyRev, k2 keyRev) int {
71+
return bytes.Compare(k.key, k2.key)
9372
}
9473

95-
func (ti *treeIndex) unsafeVisit(key, end []byte, f func(ki *keyIndex) bool) {
96-
keyi, endi := &keyIndex{key: key}, &keyIndex{key: end}
74+
func newTreeIndex(lg *zap.Logger) index {
75+
return &treeIndex{
76+
baseRev: -1,
77+
lg: lg,
78+
}
79+
}
9780

98-
ti.tree.AscendGreaterOrEqual(keyi, func(item *keyIndex) bool {
99-
if len(endi.key) > 0 && !item.Less(endi) {
100-
return false
101-
}
102-
if !f(item) {
103-
return false
81+
func (ti *treeIndex) Put(key []byte, rev Revision) {
82+
if ti.baseRev == -1 {
83+
ti.baseRev = rev.Main - 1
84+
ti.revisionTree = []*btree.BTreeG[keyRev]{
85+
btree.NewG[keyRev](32, lessThen),
10486
}
105-
return true
87+
}
88+
if rev.Main != ti.rev()+1 {
89+
panic(fmt.Sprintf("append only, lastRev: %d, putRev: %d", ti.rev(), rev.Main))
90+
}
91+
prevTree := ti.revisionTree[len(ti.revisionTree)-1]
92+
item, found := prevTree.Get(keyRev{key: key})
93+
created := rev
94+
var version int64 = 1
95+
if found {
96+
created = item.created
97+
version = item.version + 1
98+
}
99+
newTree := prevTree.Clone()
100+
newTree.ReplaceOrInsert(keyRev{
101+
key: key,
102+
mod: rev,
103+
created: created,
104+
version: version,
106105
})
106+
ti.revisionTree = append(ti.revisionTree, newTree)
107+
}
108+
109+
func (ti *treeIndex) rev() int64 {
110+
return ti.baseRev + int64(len(ti.revisionTree)) - 1
111+
}
112+
113+
func (ti *treeIndex) Get(key []byte, atRev int64) (modified, created Revision, ver int64, err error) {
114+
idx := atRev - ti.baseRev
115+
if idx < 0 || idx >= int64(len(ti.revisionTree)) {
116+
return Revision{}, Revision{}, 0, ErrRevisionNotFound
117+
}
118+
tree := ti.revisionTree[idx]
119+
keyRev, found := tree.Get(keyRev{key: key})
120+
if !found {
121+
return Revision{}, Revision{}, 0, ErrRevisionNotFound
122+
}
123+
return keyRev.mod, keyRev.created, keyRev.version, nil
107124
}
108125

109126
// Revisions returns limited number of revisions from key(included) to end(excluded)
110127
// at the given rev. The returned slice is sorted in the order of key. There is no limit if limit <= 0.
111128
// The second return parameter isn't capped by the limit and reflects the total number of revisions.
112129
func (ti *treeIndex) Revisions(key, end []byte, atRev int64, limit int) (revs []Revision, total int) {
113-
ti.RLock()
114-
defer ti.RUnlock()
115-
116130
if end == nil {
117-
rev, _, _, err := ti.unsafeGet(key, atRev)
131+
rev, _, _, err := ti.Get(key, atRev)
118132
if err != nil {
119133
return nil, 0
120134
}
121135
return []Revision{rev}, 1
122136
}
123-
ti.unsafeVisit(key, end, func(ki *keyIndex) bool {
124-
if rev, _, _, err := ki.get(ti.lg, atRev); err == nil {
125-
if limit <= 0 || len(revs) < limit {
126-
revs = append(revs, rev)
127-
}
128-
total++
137+
idx := atRev - ti.baseRev
138+
tree := ti.revisionTree[idx]
139+
tree.AscendRange(keyRev{key: key}, keyRev{key: end}, func(kr keyRev) bool {
140+
if limit <= 0 || len(revs) < limit {
141+
revs = append(revs, kr.mod)
129142
}
143+
total++
130144
return true
131145
})
132146
return revs, total
@@ -135,119 +149,74 @@ func (ti *treeIndex) Revisions(key, end []byte, atRev int64, limit int) (revs []
135149
// CountRevisions returns the number of revisions
136150
// from key(included) to end(excluded) at the given rev.
137151
func (ti *treeIndex) CountRevisions(key, end []byte, atRev int64) int {
138-
ti.RLock()
139-
defer ti.RUnlock()
140-
141152
if end == nil {
142-
_, _, _, err := ti.unsafeGet(key, atRev)
153+
_, _, _, err := ti.Get(key, atRev)
143154
if err != nil {
144155
return 0
145156
}
146157
return 1
147158
}
159+
idx := atRev - ti.baseRev
160+
tree := ti.revisionTree[idx]
148161
total := 0
149-
ti.unsafeVisit(key, end, func(ki *keyIndex) bool {
150-
if _, _, _, err := ki.get(ti.lg, atRev); err == nil {
151-
total++
152-
}
162+
tree.AscendRange(keyRev{key: key}, keyRev{key: end}, func(kr keyRev) bool {
163+
total++
153164
return true
154165
})
155166
return total
156167
}
157168

158169
func (ti *treeIndex) Range(key, end []byte, atRev int64) (keys [][]byte, revs []Revision) {
159-
ti.RLock()
160-
defer ti.RUnlock()
161-
162170
if end == nil {
163-
rev, _, _, err := ti.unsafeGet(key, atRev)
171+
rev, _, _, err := ti.Get(key, atRev)
164172
if err != nil {
165173
return nil, nil
166174
}
167175
return [][]byte{key}, []Revision{rev}
168176
}
169-
ti.unsafeVisit(key, end, func(ki *keyIndex) bool {
170-
if rev, _, _, err := ki.get(ti.lg, atRev); err == nil {
171-
revs = append(revs, rev)
172-
keys = append(keys, ki.key)
173-
}
177+
idx := atRev - ti.baseRev
178+
tree := ti.revisionTree[idx]
179+
tree.AscendRange(keyRev{key: key}, keyRev{key: end}, func(kr keyRev) bool {
180+
revs = append(revs, kr.mod)
181+
keys = append(keys, kr.key)
174182
return true
175183
})
176184
return keys, revs
177185
}
178186

179187
func (ti *treeIndex) Tombstone(key []byte, rev Revision) error {
180-
keyi := &keyIndex{key: key}
181-
182-
ti.Lock()
183-
defer ti.Unlock()
184-
ki, ok := ti.tree.Get(keyi)
185-
if !ok {
188+
if rev.Main != ti.rev()+1 {
189+
panic(fmt.Sprintf("append only, lastRev: %d, putRev: %d", ti.rev(), rev.Main))
190+
}
191+
prevTree := ti.revisionTree[len(ti.revisionTree)-1]
192+
newTree := prevTree.Clone()
193+
_, found := prevTree.Delete(keyRev{
194+
key: key,
195+
})
196+
ti.revisionTree = append(ti.revisionTree, newTree)
197+
if !found {
186198
return ErrRevisionNotFound
187199
}
188-
189-
return ki.tombstone(ti.lg, rev.Main, rev.Sub)
200+
return nil
190201
}
191202

192203
func (ti *treeIndex) Compact(rev int64) map[Revision]struct{} {
193204
available := make(map[Revision]struct{})
194205
ti.lg.Info("compact tree index", zap.Int64("revision", rev))
195-
ti.Lock()
196-
clone := ti.tree.Clone()
197-
ti.Unlock()
198-
199-
clone.Ascend(func(keyi *keyIndex) bool {
200-
// Lock is needed here to prevent modification to the keyIndex while
201-
// compaction is going on or revision added to empty before deletion
202-
ti.Lock()
203-
keyi.compact(ti.lg, rev, available)
204-
if keyi.isEmpty() {
205-
_, ok := ti.tree.Delete(keyi)
206-
if !ok {
207-
ti.lg.Panic("failed to delete during compaction")
208-
}
209-
}
210-
ti.Unlock()
211-
return true
212-
})
206+
idx := rev - ti.baseRev
207+
ti.revisionTree = ti.revisionTree[idx:]
208+
ti.baseRev = rev
213209
return available
214210
}
215211

216212
// Keep finds all revisions to be kept for a Compaction at the given rev.
217213
func (ti *treeIndex) Keep(rev int64) map[Revision]struct{} {
218214
available := make(map[Revision]struct{})
219-
ti.RLock()
220-
defer ti.RUnlock()
221-
ti.tree.Ascend(func(keyi *keyIndex) bool {
222-
keyi.keep(rev, available)
215+
idx := rev - ti.baseRev
216+
tree := ti.revisionTree[idx]
217+
tree.Ascend(func(item keyRev) bool {
218+
available[item.mod] = struct{}{}
223219
return true
224220
})
225221
return available
226222
}
227-
228-
func (ti *treeIndex) Equal(bi index) bool {
229-
b := bi.(*treeIndex)
230-
231-
if ti.tree.Len() != b.tree.Len() {
232-
return false
233-
}
234-
235-
equal := true
236-
237-
ti.tree.Ascend(func(aki *keyIndex) bool {
238-
bki, _ := b.tree.Get(aki)
239-
if !aki.equal(bki) {
240-
equal = false
241-
return false
242-
}
243-
return true
244-
})
245-
246-
return equal
247-
}
248-
249-
func (ti *treeIndex) Insert(ki *keyIndex) {
250-
ti.Lock()
251-
defer ti.Unlock()
252-
ti.tree.ReplaceOrInsert(ki)
253-
}

server/storage/mvcc/index_bench_test.go

+9-9
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,21 @@ import (
2424
"go.uber.org/zap"
2525
)
2626

27-
func BenchmarkIndexCompactBase(b *testing.B) { benchmarkIndexCompact(b, 3, 100) }
28-
func BenchmarkIndexCompactLongKey(b *testing.B) { benchmarkIndexCompact(b, 512, 100) }
29-
func BenchmarkIndexCompactLargeKeySpace(b *testing.B) { benchmarkIndexCompact(b, 3, 100000) }
27+
//func BenchmarkIndexCompactBase(b *testing.B) { benchmarkIndexCompact(b, 3, 100) }
28+
//func BenchmarkIndexCompactLongKey(b *testing.B) { benchmarkIndexCompact(b, 512, 100) }
29+
//func BenchmarkIndexCompactLargeKeySpace(b *testing.B) { benchmarkIndexCompact(b, 3, 100000) }
3030

31-
func BenchmarkIndexKeepBase(b *testing.B) { benchmarkIndexKeep(b, 3, 100) }
32-
func BenchmarkIndexKeepLongKey(b *testing.B) { benchmarkIndexKeep(b, 512, 100) }
33-
func BenchmarkIndexKeepLargeKeySpace(b *testing.B) { benchmarkIndexKeep(b, 3, 100000) }
31+
//func BenchmarkIndexKeepBase(b *testing.B) { benchmarkIndexKeep(b, 3, 100) }
32+
//func BenchmarkIndexKeepLongKey(b *testing.B) { benchmarkIndexKeep(b, 512, 100) }
33+
//func BenchmarkIndexKeepLargeKeySpace(b *testing.B) { benchmarkIndexKeep(b, 3, 100000) }
3434

3535
func BenchmarkIndexPutBase(b *testing.B) { benchmarkIndexPut(b, 3, 100) }
3636
func BenchmarkIndexPutLongKey(b *testing.B) { benchmarkIndexPut(b, 512, 100) }
3737
func BenchmarkIndexPutLargeKeySpace(b *testing.B) { benchmarkIndexPut(b, 3, 100000) }
3838

39-
func BenchmarkIndexTombstoneBase(b *testing.B) { benchmarkIndexTombstone(b, 3, 100, 25) }
40-
func BenchmarkIndexTombstoneLongKey(b *testing.B) { benchmarkIndexTombstone(b, 512, 100, 25) }
41-
func BenchmarkIndexTombstoneLargeKeySpace(b *testing.B) { benchmarkIndexTombstone(b, 3, 100000, 25) }
39+
//func BenchmarkIndexTombstoneBase(b *testing.B) { benchmarkIndexTombstone(b, 3, 100, 25) }
40+
//func BenchmarkIndexTombstoneLongKey(b *testing.B) { benchmarkIndexTombstone(b, 512, 100, 25) }
41+
//func BenchmarkIndexTombstoneLargeKeySpace(b *testing.B) { benchmarkIndexTombstone(b, 3, 100000, 25) }
4242

4343
func BenchmarkIndexGetBase(b *testing.B) { benchmarkIndexGet(b, 3, 100, 1, 25) }
4444
func BenchmarkIndexGetRepeatedKeys(b *testing.B) { benchmarkIndexGet(b, 3, 100, 1000, 25) }

0 commit comments

Comments
 (0)