Skip to content

Commit c3d8eea

Browse files
committed
fix: handle terms containing termSeparator
- Updated encodeTerm to escape termSeparator and termEscape characters.
- Modified decodeTerm to correctly handle escaped characters and return the appropriate sub-slice.

Signed-off-by: Gao Hongtao <[email protected]>
1 parent 73d06af commit c3d8eea

File tree

5 files changed

+71
-20
lines changed

5 files changed

+71
-20
lines changed

contentcoder.go

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@ package ice
1717
import (
1818
"bytes"
1919
"encoding/binary"
20+
"errors"
2021
"io"
2122

2223
"github.com/blugelabs/ice/compress"
2324
)
2425

25-
var termSeparator byte = 0xff
26-
var termSeparatorSplitSlice = []byte{termSeparator}
26+
// minTermLenWithEscape is the smallest number of bytes that must remain in the
// input when decodeTerm meets a termEscape byte: the escape byte itself plus
// the byte it escapes.
const minTermLenWithEscape = 2

var (
	// termSeparator is the byte encodeTerm appends after every encoded term.
	termSeparator = byte(0xff)
	// termEscape is the byte encodeTerm writes in front of any termSeparator
	// or termEscape byte that occurs inside a term.
	termEscape = byte('\\')
)
2732

2833
type chunkedContentCoder struct {
2934
final []byte
@@ -238,3 +243,49 @@ func readDocValueBoundary(chunk int, metaHeaders []metaData) (start, end uint64)
238243
}
239244
return start, metaHeaders[chunk].DocDvOffset
240245
}
246+
247+
func encodeTerm(dest, src []byte) []byte {
248+
if src == nil {
249+
dest = append(dest, termSeparator)
250+
return dest
251+
}
252+
if bytes.IndexByte(src, termSeparator) < 0 && bytes.IndexByte(src, termEscape) < 0 {
253+
dest = append(dest, src...)
254+
dest = append(dest, termSeparator)
255+
return dest
256+
}
257+
for _, b := range src {
258+
if b == termSeparator || b == termEscape {
259+
dest = append(dest, termEscape)
260+
}
261+
dest = append(dest, b)
262+
}
263+
dest = append(dest, termSeparator)
264+
return dest
265+
}
266+
267+
// nolint: gocritic, nolintlint
268+
func decodeTerm(dest, src []byte) ([]byte, []byte, error) {
269+
if len(src) == 0 {
270+
return nil, nil, errors.New("empty term values")
271+
}
272+
if src[0] == termSeparator {
273+
return dest, src[1:], nil
274+
}
275+
for len(src) > 0 {
276+
switch {
277+
case src[0] == termEscape:
278+
if len(src) < minTermLenWithEscape {
279+
return nil, nil, errors.New("invalid termEscape character")
280+
}
281+
src = src[1:]
282+
dest = append(dest, src[0])
283+
case src[0] == termSeparator:
284+
return dest, src[1:], nil
285+
default:
286+
dest = append(dest, src[0])
287+
}
288+
src = src[1:]
289+
}
290+
return nil, nil, errors.New("invalid term values")
291+
}

docvalues.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
package ice
1616

1717
import (
18-
"bytes"
1918
"encoding/binary"
2019
"fmt"
2120
"math"
@@ -40,6 +39,7 @@ type docValueReader struct {
4039
curChunkHeader []metaData
4140
curChunkData []byte // compressed data cache
4241
uncompressed []byte // temp buf for decompression
42+
termBuf []byte // temp buf for term decoding
4343
}
4444

4545
func (di *docValueReader) size() int {
@@ -254,14 +254,14 @@ func (di *docValueReader) visitDocValues(docNum uint64,
254254

255255
// pick the terms for the given docNum
256256
uncompressed = uncompressed[start:end]
257-
for {
258-
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
259-
if i < 0 {
260-
break
257+
startPos := 0
258+
di.termBuf = di.termBuf[:0]
259+
for len(uncompressed) > 0 {
260+
if di.termBuf, uncompressed, err = decodeTerm(di.termBuf, uncompressed); err != nil {
261+
return err
261262
}
262-
263-
visitor(di.field, uncompressed[0:i])
264-
uncompressed = uncompressed[i+1:]
263+
visitor(di.field, di.termBuf[startPos:])
264+
startPos = len(di.termBuf)
265265
}
266266

267267
return nil

new.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -841,9 +841,7 @@ func (s *interim) writeDictsTermField(docTermMap [][]byte, dict map[string]uint6
841841

842842
freqNormOffset++
843843

844-
docTermMap[docNum] = append(
845-
append(docTermMap[docNum], term...),
846-
termSeparator)
844+
docTermMap[docNum] = encodeTerm(docTermMap[docNum], []byte(term))
847845
}
848846

849847
tfEncoder.Close()

new_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) (
136136
NewFakeField("name", "wow", false, false, true),
137137
NewFakeField("desc", "some thing", false, false, true),
138138
NewFakeField("tag", "cold", false, false, true),
139+
NewFakeField("number", string([]byte{0xff})+" "+string([]byte{'\\'}), false, false, true),
139140
}
140141
doc.FakeComposite("_all", []string{"_id"})
141142

segment_test.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -387,23 +387,24 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) {
387387
}
388388
}()
389389

390-
fields := []string{"desc", "name", "tag"}
391-
fieldTerms := make(map[string][]string)
390+
fields := []string{"desc", "name", "tag", "number"}
391+
fieldTerms := make(map[string][][]byte)
392392
docValueReader, err := seg.DocumentValueReader(fields)
393393
if err != nil {
394394
t.Fatalf("err building document value reader: %v", err)
395395
}
396396
err = docValueReader.VisitDocumentValues(0, func(field string, term []byte) {
397-
fieldTerms[field] = append(fieldTerms[field], string(term))
397+
fieldTerms[field] = append(fieldTerms[field], term)
398398
})
399399
if err != nil {
400400
t.Error(err)
401401
}
402402

403-
expectedFieldTerms := map[string][]string{
404-
"name": {"wow"},
405-
"desc": {"some", "thing"},
406-
"tag": {"cold"},
403+
expectedFieldTerms := map[string][][]byte{
404+
"name": {[]byte("wow")},
405+
"desc": {[]byte("some"), []byte("thing")},
406+
"tag": {[]byte("cold")},
407+
"number": {[]byte("\\"), []byte{0xff}},
407408
}
408409
if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) {
409410
t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms)

0 commit comments

Comments
 (0)