Skip to content

Commit 2ee8354

Browse files
committed
feat: Filtering by the presence of a value in an array
Signed-off-by: cancaicai <[email protected]>
1 parent e300c83 commit 2ee8354

File tree

4 files changed

+378
-0
lines changed

4 files changed

+378
-0
lines changed
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
package logstorage
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
7+
"github.com/VictoriaMetrics/VictoriaLogs/lib/prefixfilter"
8+
"github.com/valyala/fastjson"
9+
)
10+
11+
// filterArrayContains matches rows where the given field holds a JSON array
// with an element equal to the given value.
12+
//
13+
// Example LogsQL: `tags:array_contains("prod")`
14+
type filterArrayContains struct {
15+
// fieldName is the name of the log field whose value is inspected.
fieldName string
16+
// value is the string compared against the elements of the array.
value string
17+
}
18+
19+
func (fa *filterArrayContains) String() string {
20+
return fmt.Sprintf("%sarray_contains(%s)", quoteFieldNameIfNeeded(fa.fieldName), quoteTokenIfNeeded(fa.value))
21+
}
22+
23+
func (fa *filterArrayContains) updateNeededFields(pf *prefixfilter.Filter) {
24+
pf.AddAllowFilter(fa.fieldName)
25+
}
26+
27+
func (fa *filterArrayContains) matchRow(fields []Field) bool {
28+
v := getFieldValueByName(fields, fa.fieldName)
29+
return matchArrayContains(v, fa.value)
30+
}
31+
32+
func (fa *filterArrayContains) applyToBlockResult(br *blockResult, bm *bitmap) {
33+
c := br.getColumnByName(fa.fieldName)
34+
if c.isConst {
35+
v := c.valuesEncoded[0]
36+
if !matchArrayContains(v, fa.value) {
37+
bm.resetBits()
38+
}
39+
return
40+
}
41+
if c.isTime {
42+
bm.resetBits()
43+
return
44+
}
45+
46+
switch c.valueType {
47+
case valueTypeString:
48+
values := c.getValues(br)
49+
bm.forEachSetBit(func(idx int) bool {
50+
v := values[idx]
51+
return matchArrayContains(v, fa.value)
52+
})
53+
case valueTypeDict:
54+
bb := bbPool.Get()
55+
for _, v := range c.dictValues {
56+
c := byte(0)
57+
if matchArrayContains(v, fa.value) {
58+
c = 1
59+
}
60+
bb.B = append(bb.B, c)
61+
}
62+
valuesEncoded := c.getValuesEncoded(br)
63+
bm.forEachSetBit(func(idx int) bool {
64+
n := valuesEncoded[idx][0]
65+
return bb.B[n] == 1
66+
})
67+
bbPool.Put(bb)
68+
default:
69+
bm.resetBits()
70+
}
71+
}
72+
73+
func (fa *filterArrayContains) applyToBlockSearch(bs *blockSearch, bm *bitmap) {
74+
fieldName := fa.fieldName
75+
value := fa.value
76+
77+
v := bs.getConstColumnValue(fieldName)
78+
if v != "" {
79+
if !matchArrayContains(v, value) {
80+
bm.resetBits()
81+
}
82+
return
83+
}
84+
85+
// Verify whether filter matches other columns
86+
ch := bs.getColumnHeader(fieldName)
87+
if ch == nil {
88+
// Fast path - there are no matching columns.
89+
bm.resetBits()
90+
return
91+
}
92+
93+
switch ch.valueType {
94+
case valueTypeString:
95+
matchStringByArrayContains(bs, ch, bm, value)
96+
case valueTypeDict:
97+
matchValuesDictByArrayContains(bs, ch, bm, value)
98+
default:
99+
bm.resetBits()
100+
}
101+
}
102+
103+
func matchValuesDictByArrayContains(bs *blockSearch, ch *columnHeader, bm *bitmap, value string) {
104+
bb := bbPool.Get()
105+
for _, v := range ch.valuesDict.values {
106+
c := byte(0)
107+
if matchArrayContains(v, value) {
108+
c = 1
109+
}
110+
bb.B = append(bb.B, c)
111+
}
112+
matchEncodedValuesDict(bs, ch, bm, bb.B)
113+
bbPool.Put(bb)
114+
}
115+
116+
func matchStringByArrayContains(bs *blockSearch, ch *columnHeader, bm *bitmap, value string) {
117+
visitValues(bs, ch, bm, func(v string) bool {
118+
return matchArrayContains(v, value)
119+
})
120+
}
121+
122+
func matchArrayContains(s, value string) bool {
123+
if s == "" {
124+
return false
125+
}
126+
// Fast check: if the value is not present as a substring, it definitely won't be in the array.
127+
if !strings.Contains(s, value) {
128+
return false
129+
}
130+
131+
// Fast check 2: must start with [
132+
if s[0] != '[' {
133+
return false
134+
}
135+
136+
// Use shared fastjson.ParserPool in order to avoid per-call parser allocations.
137+
p := jspp.Get()
138+
defer jspp.Put(p)
139+
v, err := p.Parse(s)
140+
if err != nil {
141+
return false
142+
}
143+
144+
// Check if it is an array
145+
a, err := v.Array()
146+
if err != nil {
147+
return false
148+
}
149+
150+
for _, elem := range a {
151+
// We only support checking against string representation of values in the array.
152+
var sElem string
153+
switch elem.Type() {
154+
case fastjson.TypeString:
155+
sElem = string(elem.GetStringBytes())
156+
case fastjson.TypeNumber:
157+
sElem = elem.String()
158+
case fastjson.TypeTrue:
159+
sElem = "true"
160+
case fastjson.TypeFalse:
161+
sElem = "false"
162+
case fastjson.TypeNull:
163+
sElem = "null"
164+
default:
165+
continue
166+
}
167+
168+
if sElem == value {
169+
return true
170+
}
171+
}
172+
173+
return false
174+
}
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
package logstorage
2+
3+
import (
4+
"testing"
5+
6+
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
7+
)
8+
9+
func TestMatchArrayContains(t *testing.T) {
10+
t.Parallel()
11+
12+
f := func(s, value string, resultExpected bool) {
13+
t.Helper()
14+
result := matchArrayContains(s, value)
15+
if result != resultExpected {
16+
t.Fatalf("unexpected result for s=%q, value=%q; got %v; want %v", s, value, result, resultExpected)
17+
}
18+
}
19+
20+
// Not an array
21+
f("", "foo", false)
22+
f("foo", "foo", false)
23+
f("{}", "foo", false)
24+
25+
// Array doesn't contain value
26+
f("[]", "foo", false)
27+
f(`["bar"]`, "foo", false)
28+
f(`["bar","baz"]`, "foo", false)
29+
f(`[1,2]`, "3", false)
30+
31+
// Array contains value
32+
f(`["foo"]`, "foo", true)
33+
f(`["bar","foo"]`, "foo", true)
34+
f(`["foo","bar"]`, "foo", true)
35+
f(`["a","foo","b"]`, "foo", true)
36+
37+
// Mixed types
38+
f(`[123]`, "123", true)
39+
f(`[true]`, "true", true)
40+
f(`["123"]`, "123", true)
41+
42+
// Tricky cases
43+
f(`["foo bar"]`, "foo", false) // partial match
44+
f(`["foobar"]`, "foo", false) // partial match
45+
f(`["foo"]`, "fo", false) // partial match
46+
47+
// Nested structures (ignored by current implementation)
48+
f(`[{"a":"b"}]`, `{"a":"b"}`, false) // nested object ignored
49+
f(`[["a"]]`, `["a"]`, false) // nested array ignored
50+
f(`[["a"], "b"]`, "b", true) // mixed with simple value
51+
}
52+
53+
func TestFilterArrayContains(t *testing.T) {
54+
t.Parallel()
55+
56+
t.Run("const-column", func(t *testing.T) {
57+
columns := []column{
58+
{
59+
name: "foo",
60+
values: []string{
61+
`["a","b"]`,
62+
`["a","b"]`,
63+
`["a","b"]`,
64+
},
65+
},
66+
}
67+
68+
// match
69+
fa := &filterArrayContains{
70+
fieldName: "foo",
71+
value: "a",
72+
}
73+
testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 1, 2})
74+
75+
fa = &filterArrayContains{
76+
fieldName: "foo",
77+
value: "b",
78+
}
79+
testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 1, 2})
80+
81+
// mismatch
82+
fa = &filterArrayContains{
83+
fieldName: "foo",
84+
value: "c",
85+
}
86+
testFilterMatchForColumns(t, columns, fa, "foo", nil)
87+
88+
fa = &filterArrayContains{
89+
fieldName: "non-existing-column",
90+
value: "a",
91+
}
92+
testFilterMatchForColumns(t, columns, fa, "foo", nil)
93+
})
94+
95+
t.Run("dict", func(t *testing.T) {
96+
columns := []column{
97+
{
98+
name: "foo",
99+
values: []string{
100+
"",
101+
`["a"]`,
102+
`["b"]`,
103+
`["a","b"]`,
104+
`"a"`, // not an array
105+
`[1,2]`,
106+
},
107+
},
108+
}
109+
110+
// match
111+
fa := &filterArrayContains{
112+
fieldName: "foo",
113+
value: "a",
114+
}
115+
testFilterMatchForColumns(t, columns, fa, "foo", []int{1, 3})
116+
117+
fa = &filterArrayContains{
118+
fieldName: "foo",
119+
value: "b",
120+
}
121+
testFilterMatchForColumns(t, columns, fa, "foo", []int{2, 3})
122+
123+
// mismatch
124+
fa = &filterArrayContains{
125+
fieldName: "foo",
126+
value: "c",
127+
}
128+
testFilterMatchForColumns(t, columns, fa, "foo", nil)
129+
})
130+
131+
t.Run("strings", func(t *testing.T) {
132+
columns := []column{
133+
{
134+
name: "foo",
135+
values: []string{
136+
`["apple", "banana"]`,
137+
`["orange"]`,
138+
`not array`,
139+
`["apple"]`,
140+
`[]`,
141+
},
142+
},
143+
}
144+
145+
// match
146+
fa := &filterArrayContains{
147+
fieldName: "foo",
148+
value: "apple",
149+
}
150+
testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 3})
151+
152+
// mismatch
153+
fa = &filterArrayContains{
154+
fieldName: "foo",
155+
value: "pear",
156+
}
157+
testFilterMatchForColumns(t, columns, fa, "foo", nil)
158+
})
159+
160+
// Remove the remaining data files for the test
161+
fs.MustRemoveDir(t.Name())
162+
}

lib/logstorage/parser.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2007,6 +2007,8 @@ func parseFilterGeneric(lex *lexer, fieldName string) (filter, error) {
20072007
return parseFilterContainsAll(lex, fieldName)
20082008
case lex.isKeyword("contains_any"):
20092009
return parseFilterContainsAny(lex, fieldName)
2010+
case lex.isKeyword("array_contains"):
2011+
return parseFilterArrayContains(lex, fieldName)
20102012
case lex.isKeyword("contains_common_case"):
20112013
return parseFilterContainsCommonCase(lex, fieldName)
20122014
case lex.isKeyword("eq_field"):
@@ -2311,6 +2313,16 @@ func parseFilterContainsAny(lex *lexer, fieldName string) (filter, error) {
23112313
return parseInValues(lex, fieldName, fi, &fi.values)
23122314
}
23132315

2316+
func parseFilterArrayContains(lex *lexer, fieldName string) (filter, error) {
2317+
return parseFuncArg(lex, fieldName, func(arg string) (filter, error) {
2318+
fa := &filterArrayContains{
2319+
fieldName: getCanonicalColumnName(fieldName),
2320+
value: arg,
2321+
}
2322+
return fa, nil
2323+
})
2324+
}
2325+
23142326
func parseFilterIn(lex *lexer, fieldName string) (filter, error) {
23152327
fi := &filterIn{
23162328
fieldName: getCanonicalColumnName(fieldName),
@@ -3803,6 +3815,7 @@ var reservedKeywords = func() map[string]struct{} {
38033815
// functions
38043816
"contains_all",
38053817
"contains_any",
3818+
"array_contains",
38063819
"contains_common_case",
38073820
"eq_field",
38083821
"equals_common_case",

0 commit comments

Comments
 (0)