@@ -78,6 +78,7 @@ const (
7878 commitFolderPrefix = "commit"
7979
8080 historyFolder = "history" // history data is snapshot-agnostic / compaction-agnostic i.e. history(t) = history(compact(t))
81+ timestampFile = "TIMESTAMP"
8182)
8283
8384// initial and final nLog size, root node size, nLog digest since initial and final points
@@ -175,9 +176,9 @@ type TBtree struct {
175176 cache * cache.Cache
176177 nmutex sync.Mutex // mutex for cache and file reading
177178
178- hLog appendable.Appendable
179-
180- cLog appendable. Appendable
179+ hLog appendable.Appendable
180+ cLog appendable. Appendable
181+ tsFile string
181182
182183 root node
183184
@@ -240,6 +241,7 @@ type node interface {
240241 minKey () []byte
241242 ts () uint64
242243 setTs (ts uint64 ) (node , error )
244+ tsMutated () bool
243245 size () (int , error )
244246 mutated () bool
245247 offset () int64 // only valid when !mutated()
@@ -364,6 +366,7 @@ func Open(path string, opts *Options) (*TBtree, error) {
364366
365367 nFolder := snapFolder (nodesFolderPrefix , snapID )
366368 cFolder := snapFolder (commitFolderPrefix , snapID )
369+ tsFile := snapFolder (timestampFile , snapID )
367370
368371 snapPath := filepath .Join (path , cFolder )
369372
@@ -396,7 +399,7 @@ func Open(path string, opts *Options) (*TBtree, error) {
396399 }
397400 if err == nil && ! discardSnapshotsFolder {
398401 // TODO: semantic validation and further amendment procedures may be done instead of a full initialization
399- t , err = OpenWith (path , nLog , hLog , cLog , opts )
402+ t , err = OpenWith (path , tsFile , nLog , hLog , cLog , opts )
400403 }
401404 if err != nil {
402405 opts .logger .Infof ("skipping snapshots at '%s', opening btree returned: %v" , snapPath , err )
@@ -446,15 +449,13 @@ func Open(path string, opts *Options) (*TBtree, error) {
446449 if err != nil {
447450 return nil , err
448451 }
449-
450- return OpenWith (path , nLog , hLog , cLog , opts )
452+ return OpenWith (path , timestampFile , nLog , hLog , cLog , opts )
451453}
452454
453455func snapFolder (folder string , snapID uint64 ) string {
454456 if snapID == 0 {
455457 return folder
456458 }
457-
458459 return fmt .Sprintf ("%s%016d" , folder , snapID )
459460}
460461
@@ -488,8 +489,9 @@ func discardSnapshots(path string, snapIDs []uint64, appRemove AppRemoveFunc, lo
488489 for _ , snapID := range snapIDs {
489490 nFolder := snapFolder (nodesFolderPrefix , snapID )
490491 cFolder := snapFolder (commitFolderPrefix , snapID )
492+ tsFile := snapFolder (timestampFile , snapID )
491493
492- logger .Infof ("discarding snapshot with id=%d at '%s'..., %d " , snapID , path )
494+ logger .Infof ("discarding snapshot with id=%d at '%s'..." , snapID , path )
493495
494496 err := appRemove (path , nFolder )
495497 if err != nil {
@@ -501,13 +503,15 @@ func discardSnapshots(path string, snapIDs []uint64, appRemove AppRemoveFunc, lo
501503 return err
502504 }
503505
506+ _ = os .Remove (filepath .Join (path , tsFile ))
507+
504508 logger .Infof ("snapshot with id=%d at '%s' has been discarded, %d" , snapID , path )
505509 }
506510
507511 return nil
508512}
509513
510- func OpenWith (path string , nLog , hLog , cLog appendable.Appendable , opts * Options ) (* TBtree , error ) {
514+ func OpenWith (path , tsFile string , nLog , hLog , cLog appendable.Appendable , opts * Options ) (* TBtree , error ) {
511515 if nLog == nil || hLog == nil || cLog == nil {
512516 return nil , ErrIllegalArguments
513517 }
@@ -575,6 +579,7 @@ func OpenWith(path string, nLog, hLog, cLog appendable.Appendable, opts *Options
575579 nLog : nLog ,
576580 hLog : hLog ,
577581 cLog : cLog ,
582+ tsFile : tsFile ,
578583 cache : nodeCache ,
579584 maxNodeSize : maxNodeSize ,
580585 maxKeySize : maxKeySize ,
@@ -694,6 +699,13 @@ func OpenWith(path string, nLog, hLog, cLog appendable.Appendable, opts *Options
694699
695700 opts .logger .Infof ("index '%s' {ts=%d, discarded_snapshots=%d} successfully loaded" , path , t .Ts (), discardedCLogEntries )
696701
702+ if ts := t .readTsFile (); ts > t .root .ts () {
703+ root , err := t .root .setTs (ts )
704+ if err != nil {
705+ return nil , err
706+ }
707+ t .root = root
708+ }
697709 return t , nil
698710}
699711
@@ -825,7 +837,6 @@ func (t *TBtree) readNodeFrom(r *appendable.Reader) (node, error) {
825837 n .off = off
826838 return n , nil
827839 }
828-
829840 return nil , ErrReadingFileContent
830841}
831842
@@ -857,7 +868,6 @@ func (t *TBtree) readInnerNodeFrom(r *appendable.Reader) (*innerNode, error) {
857868 n ._minOff = nref ._minOff
858869 }
859870 }
860-
861871 return n , nil
862872}
863873
@@ -959,7 +969,6 @@ func (t *TBtree) readLeafNodeFrom(r *appendable.Reader) (*leafNode, error) {
959969 l ._ts = ts
960970 }
961971 }
962-
963972 return l , nil
964973}
965974
@@ -1158,7 +1167,6 @@ func (t *TBtree) flushTree(cleanupPercentageHint float32, forceSync bool, forceC
11581167 }
11591168
11601169 sync := forceSync || t .insertionCountSinceSync >= t .syncThld
1161-
11621170 if sync {
11631171 err = t .hLog .Sync ()
11641172 if err != nil {
@@ -1293,6 +1301,54 @@ func (t *TBtree) flushTree(cleanupPercentageHint float32, forceSync bool, forceC
12931301 return wN , wH , nil
12941302}
12951303
1304+ func (t * TBtree ) readTsFile () uint64 {
1305+ path := filepath .Join (t .path , t .tsFile )
1306+
1307+ bs , err := os .ReadFile (path )
1308+ if err != nil {
1309+ return 0
1310+ }
1311+ return binary .BigEndian .Uint64 (bs )
1312+ }
1313+
1314+ // The timestamp (ts) file is essential for optimizing crash recovery and restart times.
1315+ // Initially, the B-Tree design only persisted timestamp information directly with inserted key-value entries.
1316+ // However, when the tree's logical timestamp is advanced (e.g., via `SetTs()`), this crucial
1317+ // 'high-water mark' is not automatically written to disk.
1318+ //
1319+ // Consequently, without this dedicated timestamp file, a system restart or crash would
1320+ // necessitate re-scanning entire segments of the transaction log. This can be
1321+ // extremely time-consuming, especially if long segments of the log have been processed
1322+ // but haven't resulted in new key insertions that trigger a tree flush, leading to
1323+ // inefficient recovery and prolonged downtime.
1324+ func (t * TBtree ) writeTsFile () error {
1325+ return writeTsFile (t .path , t .tsFile , t .root .ts ())
1326+ }
1327+
1328+ func writeTsFile (path , name string , ts uint64 ) error {
1329+ tempFileName , err := func () (string , error ) {
1330+ tempFile , err := os .CreateTemp (path , "" )
1331+ if err != nil {
1332+ return "" , err
1333+ }
1334+ defer tempFile .Close ()
1335+
1336+ var buf [8 ]byte
1337+ binary .BigEndian .PutUint64 (buf [:], ts )
1338+ _ , err = tempFile .Write (buf [:])
1339+ if err != nil {
1340+ return "" , err
1341+ }
1342+
1343+ err = tempFile .Sync ()
1344+ return tempFile .Name (), err
1345+ }()
1346+ if err != nil {
1347+ return err
1348+ }
1349+ return os .Rename (tempFileName , filepath .Join (path , name ))
1350+ }
1351+
12961352// SnapshotCount returns the number of stored snapshots
12971353// Note: snapshotCount(compact(t)) = 1
12981354func (t * TBtree ) SnapshotCount () (uint64 , error ) {
@@ -1464,7 +1520,14 @@ func (t *TBtree) fullDump(snap *Snapshot, progressOutput writeProgressOutputFunc
14641520 cLog .Close ()
14651521 }()
14661522
1467- return t .fullDumpTo (snap , nLog , cLog , progressOutput )
1523+ err = t .fullDumpTo (snap , nLog , cLog , progressOutput )
1524+ if err == nil {
1525+ tsFile := snapFolder (timestampFile , snap .Ts ())
1526+ if err := writeTsFile (t .path , tsFile , snap .Ts ()); err != nil {
1527+ t .logger .Errorf ("%s: unable to write ts file at path %s" , err , t .path )
1528+ }
1529+ }
1530+ return err
14681531}
14691532
14701533func (t * TBtree ) fullDumpTo (snapshot * Snapshot , nLog , cLog appendable.Appendable , progressOutput writeProgressOutputFunc ) error {
@@ -1561,6 +1624,12 @@ func (t *TBtree) Close() error {
15611624
15621625 t .closed = true
15631626
1627+ if t .root .tsMutated () {
1628+ if err := t .writeTsFile (); err != nil {
1629+ return err
1630+ }
1631+ }
1632+
15641633 merrors := multierr .NewMultiErr ()
15651634
15661635 _ , _ , err := t .flushTree (0 , true , false , "close" )
@@ -1607,7 +1676,6 @@ func (t *TBtree) IncreaseTs(ts uint64) error {
16071676 _ , _ , err := t .flushTree (t .cleanupPercentage , false , false , "increaseTs" )
16081677 return err
16091678 }
1610-
16111679 return nil
16121680}
16131681
@@ -2097,6 +2165,15 @@ func (n *innerNode) size() (int, error) {
20972165 return size , nil
20982166}
20992167
2168+ func (l * innerNode ) tsMutated () bool {
2169+ for _ , nd := range l .nodes {
2170+ if nd .ts () >= l .ts () {
2171+ return false
2172+ }
2173+ }
2174+ return true
2175+ }
2176+
21002177func (n * innerNode ) mutated () bool {
21012178 return n .mut
21022179}
@@ -2265,6 +2342,10 @@ func (r *nodeRef) size() (int, error) {
22652342 return n .size ()
22662343}
22672344
2345+ func (r * nodeRef ) tsMutated () bool {
2346+ return false
2347+ }
2348+
22682349func (r * nodeRef ) mutated () bool {
22692350 return false
22702351}
@@ -2600,6 +2681,15 @@ func (l *leafNode) size() (int, error) {
26002681 return size , nil
26012682}
26022683
2684+ func (l * leafNode ) tsMutated () bool {
2685+ for _ , v := range l .values {
2686+ if v .timedValues [0 ].Ts >= l .ts () {
2687+ return false
2688+ }
2689+ }
2690+ return true
2691+ }
2692+
26032693func (l * leafNode ) mutated () bool {
26042694 return l .mut
26052695}
0 commit comments