@@ -545,14 +545,17 @@ func (d *Decompressor) MadvWillNeed() *Decompressor {
545545// Getter represents a "reader" or "iterator" that can move across the data of the decompressor
546546// The full state of the getter can be captured by saving dataP and dataBit.
547547type Getter struct {
548+ dataP uint64 // current byte offset in data
549+ dataLen uint64 // len(data), precomputed
550+ dataBit int // bit offset within current byte (0-7)
551+ posMask uint16 // cached (1<<posDict.bitLen)-1 mask, avoids pointer chain
552+ posDict * posTable // Huffman table for positions
553+ data []byte // compressed bitstream (ptr at 40, len at 48, cap at 56 on 64-bit = CL0)
554+ // less hot fields
548555 patternDict * patternTable
549- posDict * posTable
556+ d * Decompressor
550557 fName string
551- data []byte
552- dataP uint64
553- dataBit int // Value 0..7 - position of the bit
554558 trace bool
555- d * Decompressor
556559}
557560
558561func (g * Getter ) MadvNormal () MadvDisabler {
@@ -565,21 +568,26 @@ func (g *Getter) Count() int { return g.d.Count() }
565568func (g * Getter ) FileName () string { return g .fName }
566569func (g * Getter ) GetMetadata () []byte { return g .d .GetMetadata () }
567570
568- func (g * Getter ) nextPos (clean bool ) uint64 {
569- if clean && g .dataBit > 0 {
571+ // nextPosClean aligns to the next byte boundary then reads the next position.
572+ func (g * Getter ) nextPosClean () uint64 {
573+ if g .dataBit > 0 {
570574 g .dataP ++
571575 g .dataBit = 0
572576 }
573- table := g .posDict
574- if table .bitLen == 0 {
575- return table .pos [0 ]
577+ return g .nextPos ()
578+ }
579+
580+ // nextPos reads the next position from the Huffman-coded bitstream.
581+ func (g * Getter ) nextPos () uint64 {
582+ if g .posDict .bitLen == 0 {
583+ return g .posDict .pos [0 ]
576584 }
585+ table := g .posDict
577586 data := g .data
578587 dataP := g .dataP
579588 dataBit := g .dataBit
580589 dataLen := uint64 (len (data ))
581- // Precompute mask for the first table (hot path optimization)
582- mask := uint16 (1 )<< table .bitLen - 1
590+ mask := g .posMask
583591 for {
584592 // Read up to 16 bits starting at dataP, shifted by dataBit
585593 code := uint16 (data [dataP ]) >> dataBit
@@ -613,11 +621,10 @@ func (g *Getter) nextPattern() []byte {
613621 data := g .data
614622 dataP := g .dataP
615623 dataBit := g .dataBit
616- dataLen := uint64 (len (data ))
617624
618625 for {
619626 code := uint16 (data [dataP ]) >> dataBit
620- if 8 - dataBit < table .bitLen && dataP + 1 < dataLen {
627+ if 8 - dataBit < table .bitLen && dataP + 1 < g . dataLen {
621628 code |= uint16 (data [dataP + 1 ]) << (8 - dataBit )
622629 }
623630 code &= (uint16 (1 ) << table .bitLen ) - 1
@@ -650,13 +657,19 @@ func (d *Decompressor) EmptyWordsCount() int { return int(d.emptyWordsCount) }
650657// Getter is not thread-safe, but there can be multiple getters used simultaneously and concurrently
651658// for the same decompressor
652659func (d * Decompressor ) MakeGetter () * Getter {
653- return & Getter {
660+ data := d .data [d .wordsStart :]
661+ g := & Getter {
654662 d : d ,
655663 posDict : d .posDict ,
656- data : d .data [d .wordsStart :],
664+ data : data ,
665+ dataLen : uint64 (len (data )),
657666 patternDict : d .dict ,
658667 fName : d .FileName (),
659668 }
669+ if d .posDict != nil {
670+ g .posMask = uint16 (1 )<< g .posDict .bitLen - 1
671+ }
672+ return g
660673}
661674
662675func (g * Getter ) DataLen () int {
@@ -669,15 +682,15 @@ func (g *Getter) Reset(offset uint64) {
669682}
670683
671684func (g * Getter ) HasNext () bool {
672- return g .dataP < uint64 ( len ( g . data ))
685+ return g .dataP < g . dataLen
673686}
674687
675688// Next extracts a compressed word from the current offset in the file
676689// and appends it to the given buf, returning the result of appending.
677690// After extracting the next word, it moves to the beginning of the next one.
678691func (g * Getter ) Next (buf []byte ) ([]byte , uint64 ) {
679692 savePos := g .dataP
680- wordLen := g .nextPos ( true )
693+ wordLen := g .nextPosClean ( )
681694 wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
682695 if wordLen == 0 {
683696 if g .dataBit > 0 {
@@ -707,7 +720,7 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
707720 // Loop below fills in the patterns
708721 // Tracking position in buf where to insert part of the word
709722 bufPos := bufOffset
710- for pos := g .nextPos (false /* clean */ ); pos != 0 ; pos = g .nextPos (false ) {
723+ for pos := g .nextPos (); pos != 0 ; pos = g .nextPos () {
711724 bufPos += int (pos ) - 1 // Positions where to insert patterns are encoded relative to one another
712725 pt := g .nextPattern ()
713726 copy (buf [bufPos :], pt )
@@ -719,14 +732,14 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
719732 postLoopPos := g .dataP
720733 g .dataP = savePos
721734 g .dataBit = 0
722- g .nextPos ( true /* clean */ ) // Reset the state of huffman reader
735+ g .nextPosClean ( ) // Reset the state of huffman reader
723736
724737 // Restore to the beginning of buf
725738 bufPos = bufOffset
726739 lastUncovered := bufOffset
727740
728741 // Loop below fills the data which is not in the patterns
729- for pos := g .nextPos (false ); pos != 0 ; pos = g .nextPos (false ) {
742+ for pos := g .nextPos (); pos != 0 ; pos = g .nextPos () {
730743 bufPos += int (pos ) - 1 // Positions where to insert patterns are encoded relative to one another
731744 if bufPos > lastUncovered {
732745 dif := uint64 (bufPos - lastUncovered )
@@ -746,7 +759,7 @@ func (g *Getter) Next(buf []byte) ([]byte, uint64) {
746759}
747760
748761func (g * Getter ) NextUncompressed () ([]byte , uint64 ) {
749- wordLen := g .nextPos ( true )
762+ wordLen := g .nextPosClean ( )
750763 wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
751764 if wordLen == 0 {
752765 if g .dataBit > 0 {
@@ -755,7 +768,7 @@ func (g *Getter) NextUncompressed() ([]byte, uint64) {
755768 }
756769 return g .data [g .dataP :g .dataP ], g .dataP
757770 }
758- g .nextPos (false )
771+ g .nextPos ()
759772 if g .dataBit > 0 {
760773 g .dataP ++
761774 g .dataBit = 0
@@ -767,7 +780,7 @@ func (g *Getter) NextUncompressed() ([]byte, uint64) {
767780
768781// Skip moves offset to the next word and returns the new offset and the length of the word.
769782func (g * Getter ) Skip () (uint64 , int ) {
770- l := g .nextPos ( true )
783+ l := g .nextPosClean ( )
771784 l -- // because when create huffman tree we do ++ , because 0 is terminator
772785 if l == 0 {
773786 if g .dataBit > 0 {
@@ -781,7 +794,7 @@ func (g *Getter) Skip() (uint64, int) {
781794 var add uint64
782795 var bufPos int
783796 var lastUncovered int
784- for pos := g .nextPos (false /* clean */ ); pos != 0 ; pos = g .nextPos (false ) {
797+ for pos := g .nextPos (); pos != 0 ; pos = g .nextPos () {
785798 bufPos += int (pos ) - 1
786799 if wordLen < bufPos {
787800 panic (fmt .Sprintf ("likely .idx is invalid: %s" , g .fName ))
@@ -804,7 +817,7 @@ func (g *Getter) Skip() (uint64, int) {
804817}
805818
806819func (g * Getter ) SkipUncompressed () (uint64 , int ) {
807- wordLen := g .nextPos ( true )
820+ wordLen := g .nextPosClean ( )
808821 wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
809822 if wordLen == 0 {
810823 if g .dataBit > 0 {
@@ -813,7 +826,7 @@ func (g *Getter) SkipUncompressed() (uint64, int) {
813826 }
814827 return g .dataP , 0
815828 }
816- g .nextPos (false )
829+ g .nextPos ()
817830 if g .dataBit > 0 {
818831 g .dataP ++
819832 g .dataBit = 0
@@ -826,7 +839,7 @@ func (g *Getter) SkipUncompressed() (uint64, int) {
826839func (g * Getter ) MatchPrefix (prefix []byte ) bool {
827840 savePos := g .dataP
828841
829- wordLen := g .nextPos ( true /* clean */ )
842+ wordLen := g .nextPosClean ( )
830843 wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
831844 prefixLen := len (prefix )
832845 if wordLen == 0 || int (wordLen ) < prefixLen {
@@ -837,7 +850,7 @@ func (g *Getter) MatchPrefix(prefix []byte) bool {
837850 var bufPos int
838851 // In the first pass, we only check patterns
839852 // Only run this loop as far as the prefix goes, there is no need to check further
840- for pos := g .nextPos (false /* clean */ ); pos != 0 ; pos = g .nextPos (false ) {
853+ for pos := g .nextPos (); pos != 0 ; pos = g .nextPos () {
841854 bufPos += int (pos ) - 1
842855 pattern := g .nextPattern ()
843856 var comparisonLen int
@@ -860,11 +873,11 @@ func (g *Getter) MatchPrefix(prefix []byte) bool {
860873 }
861874 postLoopPos := g .dataP
862875 g .dataP , g .dataBit = savePos , 0
863- g .nextPos ( true /* clean */ ) // Reset the state of huffman decoder
876+ g .nextPosClean ( ) // Reset the state of huffman decoder
864877 // Second pass - we check spaces not covered by the patterns
865878 var lastUncovered int
866879 bufPos = 0
867- for pos := g .nextPos (false /* clean */ ); pos != 0 && lastUncovered < prefixLen ; pos = g .nextPos (false ) {
880+ for pos := g .nextPos (); pos != 0 && lastUncovered < prefixLen ; pos = g .nextPos () {
868881 bufPos += int (pos ) - 1
869882 if bufPos > lastUncovered {
870883 dif := uint64 (bufPos - lastUncovered )
@@ -903,7 +916,7 @@ func (g *Getter) MatchPrefix(prefix []byte) bool {
903916// returns 0 if buf == word, -1 if buf < word, 1 if buf > word
904917func (g * Getter ) MatchCmp (buf []byte ) int {
905918 savePos := g .dataP
906- wordLen := g .nextPos ( true )
919+ wordLen := g .nextPosClean ( )
907920 wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
908921 lenBuf := len (buf )
909922 if wordLen == 0 && lenBuf != 0 {
@@ -921,7 +934,7 @@ func (g *Getter) MatchCmp(buf []byte) int {
921934 decoded := make ([]byte , wordLen )
922935 var bufPos int
923936 // In the first pass, we only check patterns
924- for pos := g .nextPos (false /* clean */ ); pos != 0 ; pos = g .nextPos (false ) {
937+ for pos := g .nextPos (); pos != 0 ; pos = g .nextPos () {
925938 bufPos += int (pos ) - 1
926939 pattern := g .nextPattern ()
927940 copy (decoded [bufPos :], pattern )
@@ -932,11 +945,11 @@ func (g *Getter) MatchCmp(buf []byte) int {
932945 }
933946 postLoopPos := g .dataP
934947 g .dataP , g .dataBit = savePos , 0
935- g .nextPos ( true /* clean */ ) // Reset the state of huffman decoder
948+ g .nextPosClean ( ) // Reset the state of huffman decoder
936949 // Second pass - we check spaces not covered by the patterns
937950 var lastUncovered int
938951 bufPos = 0
939- for pos := g .nextPos (false /* clean */ ); pos != 0 ; pos = g .nextPos (false ) {
952+ for pos := g .nextPos (); pos != 0 ; pos = g .nextPos () {
940953 bufPos += int (pos ) - 1
941954 // fmt.Printf("BUF POS: %d, POS: %d, lastUncovered: %d\n", bufPos, pos, lastUncovered)
942955 if bufPos > lastUncovered {
@@ -967,7 +980,7 @@ func (g *Getter) MatchPrefixUncompressed(prefix []byte) bool {
967980 g .dataP , g .dataBit = savePos , 0
968981 }()
969982
970- wordLen := g .nextPos ( true /* clean */ )
983+ wordLen := g .nextPosClean ( )
971984 wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
972985 prefixLen := len (prefix )
973986 if wordLen == 0 && prefixLen != 0 {
@@ -977,7 +990,7 @@ func (g *Getter) MatchPrefixUncompressed(prefix []byte) bool {
977990 return false
978991 }
979992
980- g .nextPos ( true )
993+ g .nextPosClean ( )
981994
982995 return bytes .HasPrefix (g .data [g .dataP :g .dataP + wordLen ], prefix )
983996}
@@ -988,7 +1001,7 @@ func (g *Getter) MatchCmpUncompressed(buf []byte) int {
9881001 g .dataP , g .dataBit = savePos , 0
9891002 }()
9901003
991- wordLen := g .nextPos ( true /* clean */ )
1004+ wordLen := g .nextPosClean ( )
9921005 wordLen -- // because when create huffman tree we do ++ , because 0 is terminator
9931006 bufLen := len (buf )
9941007 if wordLen == 0 && bufLen != 0 {
@@ -998,7 +1011,7 @@ func (g *Getter) MatchCmpUncompressed(buf []byte) int {
9981011 return - 1
9991012 }
10001013
1001- g .nextPos ( true )
1014+ g .nextPosClean ( )
10021015
10031016 return bytes .Compare (buf , g .data [g .dataP :g .dataP + wordLen ])
10041017}
0 commit comments