@@ -121,19 +121,22 @@ func (e ErrCompressedFileCorrupted) Is(err error) bool {
121121
122122// Decompressor provides access to the superstrings in a file produced by a compressor
123123type Decompressor struct {
124- f * os.File
125- mmapHandle2 * [mmap .MaxMapSize ]byte // mmap handle for windows (this is used to close mmap)
126- dict * patternTable
127- posDict * posTable
128- mmapHandle1 []byte // mmap handle for unix (this is used to close mmap)
129- data []byte // slice of correct size for the decompressor to work with
130- wordsStart uint64 // Offset of whether the superstrings actually start
131- size int64
132- modTime time.Time
133- wordsCount uint64
134- emptyWordsCount uint64
135- hasMetadata bool
136- metadata []byte
124+ f * os.File
125+ mmapHandle2 * [mmap .MaxMapSize ]byte // mmap handle for windows (this is used to close mmap)
126+ dict * patternTable
127+ posDict * posTable
128+ mmapHandle1 []byte // mmap handle for unix (this is used to close mmap)
129+ data []byte // slice of correct size for the decompressor to work with
130+ wordsStart uint64 // Offset of where the superstrings actually start
131+ size int64
132+ modTime time.Time
133+ wordsCount uint64
134+ emptyWordsCount uint64
135+ hasMetadata bool
136+ metadata []byte
137+ version uint8
138+ featureFlagBitmask FeatureFlagBitmask
139+ compPageValuesCount uint8
137140
138141 serializedDictSize uint64
139142 lenDictSize uint64 // huffman encoded lengths
@@ -231,6 +234,21 @@ func NewDecompressorWithMetadata(compressedFilePath string, hasMetadata bool) (*
231234 d .data = d .mmapHandle1 [:d .size ]
232235 defer d .MadvNormal ().DisableReadAhead () //speedup opening on slow drives
233236
237+ d .version = d .data [0 ]
238+
239+ if d .version == FileCompressionFormatV1 {
240+ // 1st byte: version,
241+ // 2nd byte: defines how exactly the file is compressed
242+ // 3rd byte (optional): present only if the PageLevelCompressionEnabled flag is set; defines the number of values on a compressed page
243+ d .featureFlagBitmask = FeatureFlagBitmask (d .data [1 ])
244+ d .data = d .data [2 :]
245+ }
246+
247+ if d .featureFlagBitmask .Has (PageLevelCompressionEnabled ) {
248+ d .compPageValuesCount = d .data [0 ]
249+ d .data = d .data [1 :]
250+ }
251+
234252 if hasMetadata {
235253 metadataLen := binary .BigEndian .Uint32 (d .data [:4 ])
236254 d .metadata = d .data [4 : 4 + metadataLen ]
@@ -362,10 +380,11 @@ func NewDecompressorWithMetadata(compressedFilePath string, hasMetadata bool) (*
362380 }
363381 d .wordsStart = pos + dictSize
364382
365- if d .Count () == 0 && dictSize == 0 && d .size > compressedMinSize {
383+ if d .Count () == 0 && dictSize == 0 && d .size > d . calcCompressedMinSize () {
366384 return nil , & ErrCompressedFileCorrupted {
367385 FileName : fName , Reason : fmt .Sprintf ("size %v but no words in it" , datasize .ByteSize (d .size ).HR ())}
368386 }
387+
369388 validationPassed = true
370389 return d , nil
371390}
@@ -466,10 +485,12 @@ func buildPosTable(depths []uint64, poss []uint64, table *posTable, code uint16,
466485func (d * Decompressor ) DataHandle () unsafe.Pointer {
467486 return unsafe .Pointer (& d .data [0 ])
468487}
469- func (d * Decompressor ) SerializedDictSize () uint64 { return d .serializedDictSize }
470- func (d * Decompressor ) SerializedLenSize () uint64 { return d .lenDictSize }
471- func (d * Decompressor ) DictWords () int { return d .dictWords }
472- func (d * Decompressor ) DictLens () int { return d .dictLens }
488+ func (d * Decompressor ) SerializedDictSize () uint64 { return d .serializedDictSize }
489+ func (d * Decompressor ) SerializedLenSize () uint64 { return d .lenDictSize }
490+ func (d * Decompressor ) DictWords () int { return d .dictWords }
491+ func (d * Decompressor ) DictLens () int { return d .dictLens }
492+ func (d * Decompressor ) CompressedPageValuesCount () int { return int (d .compPageValuesCount ) }
493+ func (d * Decompressor ) CompressionFormatVersion () uint8 { return d .version }
473494
474495func (d * Decompressor ) Size () int64 {
475496 return d .size
@@ -1082,3 +1103,15 @@ func (g *Getter) BinarySearch(seek []byte, count int, getOffset func(i uint64) (
10821103 }
10831104 return foundOffset , true
10841105}
1106+
1107+ func (d * Decompressor ) calcCompressedMinSize () int64 {
1108+ if d .version == FileCompressionFormatV0 {
1109+ return compressedMinSize
1110+ }
1111+
1112+ if d .featureFlagBitmask .Has (PageLevelCompressionEnabled ) {
1113+ return compressedMinSize + 3 // 2 bytes always are used for bitmask and version + 1 optional for page level compression if enabled
1114+ }
1115+
1116+ return compressedMinSize + 2
1117+ }
0 commit comments