Skip to content

Commit 6fc93b9

Browse files
committed
fix: handle terms containing termSeparator
- Updated encodeTerm to escape termSeparator and termEscape bytes that occur inside a term.
- Modified decodeTerm to unescape those bytes and to return the remaining, not-yet-decoded sub-slice of the input.

Signed-off-by: Gao Hongtao <[email protected]>
1 parent 73d06af commit 6fc93b9

File tree

5 files changed

+66
-16
lines changed

5 files changed

+66
-16
lines changed

contentcoder.go

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@ package ice
1717
import (
1818
"bytes"
1919
"encoding/binary"
20+
"errors"
2021
"io"
2122

2223
"github.com/blugelabs/ice/compress"
2324
)
2425

25-
var termSeparator byte = 0xff
26-
var termSeparatorSplitSlice = []byte{termSeparator}
26+
// minTermLenWithEscape is the fewest bytes that must remain in the input when
// decodeTerm encounters termEscape: the escape byte itself plus the single
// byte it protects.
const minTermLenWithEscape = 2
27+
28+
var (
29+
// termSeparator is the byte encodeTerm appends after every term; decodeTerm
// treats an unescaped occurrence as the end of the current term.
termSeparator byte = 0xff
30+
// termEscape is the byte encodeTerm writes in front of any termSeparator or
// termEscape byte found inside a term, so decodeTerm takes the following
// byte literally.
termEscape byte = '\\'
31+
)
2732

2833
type chunkedContentCoder struct {
2934
final []byte
@@ -238,3 +243,50 @@ func readDocValueBoundary(chunk int, metaHeaders []metaData) (start, end uint64)
238243
}
239244
return start, metaHeaders[chunk].DocDvOffset
240245
}
246+
247+
func encodeTerm(dest, src []byte) []byte {
248+
if src == nil {
249+
dest = append(dest, termSeparator)
250+
return dest
251+
}
252+
if bytes.IndexByte(src, termSeparator) < 0 && bytes.IndexByte(src, termEscape) < 0 {
253+
dest = append(dest, src...)
254+
dest = append(dest, termSeparator)
255+
return dest
256+
}
257+
for _, b := range src {
258+
if b == termSeparator || b == termEscape {
259+
dest = append(dest, termEscape)
260+
}
261+
dest = append(dest, b)
262+
}
263+
dest = append(dest, termSeparator)
264+
return dest
265+
}
266+
267+
// nolint: gocritic, nolintlint
268+
func decodeTerm(dest, src []byte) ([]byte, []byte, error) {
269+
if len(src) == 0 {
270+
return nil, nil, errors.New("empty term values")
271+
}
272+
dest = dest[:0]
273+
if src[0] == termSeparator {
274+
return dest, src[1:], nil
275+
}
276+
for len(src) > 0 {
277+
switch {
278+
case src[0] == termEscape:
279+
if len(src) < minTermLenWithEscape {
280+
return nil, nil, errors.New("invalid termEscape character")
281+
}
282+
src = src[1:]
283+
dest = append(dest, src[0])
284+
case src[0] == termSeparator:
285+
return dest, src[1:], nil
286+
default:
287+
dest = append(dest, src[0])
288+
}
289+
src = src[1:]
290+
}
291+
return nil, nil, errors.New("invalid term values")
292+
}

docvalues.go

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
package ice
1616

1717
import (
18-
"bytes"
1918
"encoding/binary"
2019
"fmt"
2120
"math"
@@ -40,6 +39,7 @@ type docValueReader struct {
4039
curChunkHeader []metaData
4140
curChunkData []byte // compressed data cache
4241
uncompressed []byte // temp buf for decompression
42+
termBuf []byte // temp buf for term decoding
4343
}
4444

4545
func (di *docValueReader) size() int {
@@ -254,14 +254,12 @@ func (di *docValueReader) visitDocValues(docNum uint64,
254254

255255
// pick the terms for the given docNum
256256
uncompressed = uncompressed[start:end]
257-
for {
258-
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
259-
if i < 0 {
260-
break
257+
for len(uncompressed) > 0 {
258+
if di.termBuf, uncompressed, err = decodeTerm(di.termBuf, uncompressed); err != nil {
259+
return err
261260
}
262261

263-
visitor(di.field, uncompressed[0:i])
264-
uncompressed = uncompressed[i+1:]
262+
visitor(di.field, di.termBuf)
265263
}
266264

267265
return nil

new.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -841,9 +841,7 @@ func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint6
841841

842842
freqNormOffset++
843843

844-
docTermMap[docNum] = append(
845-
append(docTermMap[docNum], term...),
846-
termSeparator)
844+
docTermMap[docNum] = encodeTerm(docTermMap[docNum], []byte(term))
847845
}
848846

849847
tfEncoder.Close()

new_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) (
136136
NewFakeField("name", "wow", false, false, true),
137137
NewFakeField("desc", "some thing", false, false, true),
138138
NewFakeField("tag", "cold", false, false, true),
139+
NewFakeField("number", string([]byte{0xff}), false, false, true),
139140
}
140141
doc.FakeComposite("_all", []string{"_id"})
141142

segment_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,7 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) {
387387
}
388388
}()
389389

390-
fields := []string{"desc", "name", "tag"}
390+
fields := []string{"desc", "name", "tag", "number"}
391391
fieldTerms := make(map[string][]string)
392392
docValueReader, err := seg.DocumentValueReader(fields)
393393
if err != nil {
@@ -401,9 +401,10 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) {
401401
}
402402

403403
expectedFieldTerms := map[string][]string{
404-
"name": {"wow"},
405-
"desc": {"some", "thing"},
406-
"tag": {"cold"},
404+
"name": {"wow"},
405+
"desc": {"some", "thing"},
406+
"tag": {"cold"},
407+
"number": {string([]byte{0xff})},
407408
}
408409
if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) {
409410
t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms)

0 commit comments

Comments
 (0)