Skip to content

Commit ba5ad0f

Browse files
committed
fix(embedded/tbtree): save timestamp to a separate file to avoid rescanning empty indexes
Signed-off-by: Stefano Scafiti <[email protected]>
1 parent cf9a5d8 commit ba5ad0f

File tree

3 files changed

+211
-28
lines changed

3 files changed

+211
-28
lines changed

embedded/tbtree/snapshot.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,6 @@ func (l *leafNode) writeTo(nw, hw io.Writer, writeOpts *WriteOpts, buf []byte) (
509509
bi += 2
510510

511511
accH := int64(0)
512-
513512
for _, v := range l.values {
514513
timedValue := v.timedValues[0]
515514

embedded/tbtree/tbtree.go

Lines changed: 105 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ const (
7878
commitFolderPrefix = "commit"
7979

8080
historyFolder = "history" // history data is snapshot-agnostic / compaction-agnostic i.e. history(t) = history(compact(t))
81+
timestampFile = "TIMESTAMP"
8182
)
8283

8384
// initial and final nLog size, root node size, nLog digest since initial and final points
@@ -175,9 +176,9 @@ type TBtree struct {
175176
cache *cache.Cache
176177
nmutex sync.Mutex // mutex for cache and file reading
177178

178-
hLog appendable.Appendable
179-
180-
cLog appendable.Appendable
179+
hLog appendable.Appendable
180+
cLog appendable.Appendable
181+
tsFile string
181182

182183
root node
183184

@@ -240,6 +241,7 @@ type node interface {
240241
minKey() []byte
241242
ts() uint64
242243
setTs(ts uint64) (node, error)
244+
tsMutated() bool
243245
size() (int, error)
244246
mutated() bool
245247
offset() int64 // only valid when !mutated()
@@ -364,6 +366,7 @@ func Open(path string, opts *Options) (*TBtree, error) {
364366

365367
nFolder := snapFolder(nodesFolderPrefix, snapID)
366368
cFolder := snapFolder(commitFolderPrefix, snapID)
369+
tsFile := snapFolder(timestampFile, snapID)
367370

368371
snapPath := filepath.Join(path, cFolder)
369372

@@ -396,7 +399,7 @@ func Open(path string, opts *Options) (*TBtree, error) {
396399
}
397400
if err == nil && !discardSnapshotsFolder {
398401
// TODO: semantic validation and further amendment procedures may be done instead of a full initialization
399-
t, err = OpenWith(path, nLog, hLog, cLog, opts)
402+
t, err = OpenWith(path, tsFile, nLog, hLog, cLog, opts)
400403
}
401404
if err != nil {
402405
opts.logger.Infof("skipping snapshots at '%s', opening btree returned: %v", snapPath, err)
@@ -446,15 +449,13 @@ func Open(path string, opts *Options) (*TBtree, error) {
446449
if err != nil {
447450
return nil, err
448451
}
449-
450-
return OpenWith(path, nLog, hLog, cLog, opts)
452+
return OpenWith(path, timestampFile, nLog, hLog, cLog, opts)
451453
}
452454

453455
func snapFolder(folder string, snapID uint64) string {
454456
if snapID == 0 {
455457
return folder
456458
}
457-
458459
return fmt.Sprintf("%s%016d", folder, snapID)
459460
}
460461

@@ -488,8 +489,9 @@ func discardSnapshots(path string, snapIDs []uint64, appRemove AppRemoveFunc, lo
488489
for _, snapID := range snapIDs {
489490
nFolder := snapFolder(nodesFolderPrefix, snapID)
490491
cFolder := snapFolder(commitFolderPrefix, snapID)
492+
tsFile := snapFolder(timestampFile, snapID)
491493

492-
logger.Infof("discarding snapshot with id=%d at '%s'..., %d", snapID, path)
494+
logger.Infof("discarding snapshot with id=%d at '%s'...", snapID, path)
493495

494496
err := appRemove(path, nFolder)
495497
if err != nil {
@@ -501,13 +503,15 @@ func discardSnapshots(path string, snapIDs []uint64, appRemove AppRemoveFunc, lo
501503
return err
502504
}
503505

506+
_ = os.Remove(filepath.Join(path, tsFile))
507+
504508
logger.Infof("snapshot with id=%d at '%s' has been discarded, %d", snapID, path)
505509
}
506510

507511
return nil
508512
}
509513

510-
func OpenWith(path string, nLog, hLog, cLog appendable.Appendable, opts *Options) (*TBtree, error) {
514+
func OpenWith(path, tsFile string, nLog, hLog, cLog appendable.Appendable, opts *Options) (*TBtree, error) {
511515
if nLog == nil || hLog == nil || cLog == nil {
512516
return nil, ErrIllegalArguments
513517
}
@@ -575,6 +579,7 @@ func OpenWith(path string, nLog, hLog, cLog appendable.Appendable, opts *Options
575579
nLog: nLog,
576580
hLog: hLog,
577581
cLog: cLog,
582+
tsFile: tsFile,
578583
cache: nodeCache,
579584
maxNodeSize: maxNodeSize,
580585
maxKeySize: maxKeySize,
@@ -694,6 +699,13 @@ func OpenWith(path string, nLog, hLog, cLog appendable.Appendable, opts *Options
694699

695700
opts.logger.Infof("index '%s' {ts=%d, discarded_snapshots=%d} successfully loaded", path, t.Ts(), discardedCLogEntries)
696701

702+
if ts := t.readTsFile(); ts > t.root.ts() {
703+
root, err := t.root.setTs(ts)
704+
if err != nil {
705+
return nil, err
706+
}
707+
t.root = root
708+
}
697709
return t, nil
698710
}
699711

@@ -825,7 +837,6 @@ func (t *TBtree) readNodeFrom(r *appendable.Reader) (node, error) {
825837
n.off = off
826838
return n, nil
827839
}
828-
829840
return nil, ErrReadingFileContent
830841
}
831842

@@ -857,7 +868,6 @@ func (t *TBtree) readInnerNodeFrom(r *appendable.Reader) (*innerNode, error) {
857868
n._minOff = nref._minOff
858869
}
859870
}
860-
861871
return n, nil
862872
}
863873

@@ -959,7 +969,6 @@ func (t *TBtree) readLeafNodeFrom(r *appendable.Reader) (*leafNode, error) {
959969
l._ts = ts
960970
}
961971
}
962-
963972
return l, nil
964973
}
965974

@@ -1158,7 +1167,6 @@ func (t *TBtree) flushTree(cleanupPercentageHint float32, forceSync bool, forceC
11581167
}
11591168

11601169
sync := forceSync || t.insertionCountSinceSync >= t.syncThld
1161-
11621170
if sync {
11631171
err = t.hLog.Sync()
11641172
if err != nil {
@@ -1293,6 +1301,54 @@ func (t *TBtree) flushTree(cleanupPercentageHint float32, forceSync bool, forceC
12931301
return wN, wH, nil
12941302
}
12951303

1304+
func (t *TBtree) readTsFile() uint64 {
1305+
path := filepath.Join(t.path, t.tsFile)
1306+
1307+
bs, err := os.ReadFile(path)
1308+
if err != nil {
1309+
return 0
1310+
}
1311+
return binary.BigEndian.Uint64(bs)
1312+
}
1313+
1314+
// The timestamp (ts) file is essential for optimizing crash recovery and restart times.
1315+
// Initially, the B-Tree design only persisted timestamp information directly with inserted key-value entries.
1316+
// However, when the tree's logical timestamp is advanced (e.g., via `SetTs()`), this crucial
1317+
// 'high-water mark' is not automatically written to disk.
1318+
//
1319+
// Consequently, without this dedicated timestamp file, a system restart or crash would
1320+
// necessitate re-scanning entire segments of the transaction log. This can be
1321+
// extremely time-consuming, especially if long segments of the log have been processed
1322+
// but haven't resulted in new key insertions that trigger a tree flush, leading to
1323+
// inefficient recovery and prolonged downtime.
1324+
func (t *TBtree) writeTsFile() error {
1325+
return writeTsFile(t.path, t.tsFile, t.root.ts())
1326+
}
1327+
1328+
func writeTsFile(path, name string, ts uint64) error {
1329+
tempFileName, err := func() (string, error) {
1330+
tempFile, err := os.CreateTemp(path, "")
1331+
if err != nil {
1332+
return "", err
1333+
}
1334+
defer tempFile.Close()
1335+
1336+
var buf [8]byte
1337+
binary.BigEndian.PutUint64(buf[:], ts)
1338+
_, err = tempFile.Write(buf[:])
1339+
if err != nil {
1340+
return "", err
1341+
}
1342+
1343+
err = tempFile.Sync()
1344+
return tempFile.Name(), err
1345+
}()
1346+
if err != nil {
1347+
return err
1348+
}
1349+
return os.Rename(tempFileName, filepath.Join(path, name))
1350+
}
1351+
12961352
// SnapshotCount returns the number of stored snapshots
12971353
// Note: snapshotCount(compact(t)) = 1
12981354
func (t *TBtree) SnapshotCount() (uint64, error) {
@@ -1464,7 +1520,14 @@ func (t *TBtree) fullDump(snap *Snapshot, progressOutput writeProgressOutputFunc
14641520
cLog.Close()
14651521
}()
14661522

1467-
return t.fullDumpTo(snap, nLog, cLog, progressOutput)
1523+
err = t.fullDumpTo(snap, nLog, cLog, progressOutput)
1524+
if err == nil {
1525+
tsFile := snapFolder(timestampFile, snap.Ts())
1526+
if err := writeTsFile(t.path, tsFile, snap.Ts()); err != nil {
1527+
t.logger.Errorf("%s: unable to write ts file at path %s", err, t.path)
1528+
}
1529+
}
1530+
return err
14681531
}
14691532

14701533
func (t *TBtree) fullDumpTo(snapshot *Snapshot, nLog, cLog appendable.Appendable, progressOutput writeProgressOutputFunc) error {
@@ -1561,6 +1624,12 @@ func (t *TBtree) Close() error {
15611624

15621625
t.closed = true
15631626

1627+
if t.root.tsMutated() {
1628+
if err := t.writeTsFile(); err != nil {
1629+
return err
1630+
}
1631+
}
1632+
15641633
merrors := multierr.NewMultiErr()
15651634

15661635
_, _, err := t.flushTree(0, true, false, "close")
@@ -1607,7 +1676,6 @@ func (t *TBtree) IncreaseTs(ts uint64) error {
16071676
_, _, err := t.flushTree(t.cleanupPercentage, false, false, "increaseTs")
16081677
return err
16091678
}
1610-
16111679
return nil
16121680
}
16131681

@@ -2097,6 +2165,15 @@ func (n *innerNode) size() (int, error) {
20972165
return size, nil
20982166
}
20992167

2168+
func (l *innerNode) tsMutated() bool {
2169+
for _, nd := range l.nodes {
2170+
if nd.ts() >= l.ts() {
2171+
return false
2172+
}
2173+
}
2174+
return true
2175+
}
2176+
21002177
func (n *innerNode) mutated() bool {
21012178
return n.mut
21022179
}
@@ -2265,6 +2342,10 @@ func (r *nodeRef) size() (int, error) {
22652342
return n.size()
22662343
}
22672344

2345+
func (r *nodeRef) tsMutated() bool {
2346+
return false
2347+
}
2348+
22682349
func (r *nodeRef) mutated() bool {
22692350
return false
22702351
}
@@ -2600,6 +2681,15 @@ func (l *leafNode) size() (int, error) {
26002681
return size, nil
26012682
}
26022683

2684+
func (l *leafNode) tsMutated() bool {
2685+
for _, v := range l.values {
2686+
if v.timedValues[0].Ts >= l.ts() {
2687+
return false
2688+
}
2689+
}
2690+
return true
2691+
}
2692+
26032693
func (l *leafNode) mutated() bool {
26042694
return l.mut
26052695
}

0 commit comments

Comments
 (0)