Skip to content

Commit e446c38

Browse files
committed
[PLAT-411] array contains filter on unnest dimensions (#8897)
* array contains filter on unnest dimensions * review comment * lint
1 parent 1730638 commit e446c38

File tree

4 files changed

+713
-0
lines changed

4 files changed

+713
-0
lines changed

runtime/drivers/olap.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,20 @@ func (d Dialect) UnnestSQLSuffix(tbl string) string {
420420
return fmt.Sprintf(", %s", tbl)
421421
}
422422

423+
func (d Dialect) RequiresArrayContainsForInOperator() bool {
424+
return d == DialectDuckDB || d == DialectClickHouse
425+
}
426+
427+
func (d Dialect) GetArrayContainsFunction() string {
428+
if d == DialectDuckDB {
429+
return "list_has_any"
430+
}
431+
if d == DialectClickHouse {
432+
return "hasAny"
433+
}
434+
panic(fmt.Sprintf("unsupported dialect %q for array contains function", d))
435+
}
436+
423437
func (d Dialect) MetricsViewDimensionExpression(dimension *runtimev1.MetricsViewSpec_Dimension) (string, error) {
424438
if dimension.LookupTable != "" {
425439
var keyExpr string

runtime/metricsview/astexpr.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,12 @@ func (b *sqlExprBuilder) writeBinaryCondition(exprs []*Expression, op Operator)
275275
return b.writeBinaryConditionInner(nil, right, leftExpr, op)
276276
}
277277

278+
// For IN/NIN on unnest dimensions backed by DuckDB or ClickHouse, use native array-contains
279+
// functions (list_has_any / hasAny). This avoids double-counting when a row's array contains multiple matching values.
280+
if (op == OperatorIn || op == OperatorNin) && b.ast.Dialect.RequiresArrayContainsForInOperator() {
281+
return b.writeArrayContainsCondition(leftExpr, right, op == OperatorNin)
282+
}
283+
278284
// Generate unnest join
279285
unnestTableAlias := b.ast.GenerateIdentifier()
280286
unnestFrom, tupleStyle, auto, err := b.ast.Dialect.LateralUnnest(leftExpr, unnestTableAlias, left.Name)
@@ -646,6 +652,45 @@ func (b *sqlExprBuilder) writeInConditionForValues(left *Expression, leftOverrid
646652
return nil
647653
}
648654

655+
func (b *sqlExprBuilder) writeArrayContainsCondition(leftExpr string, right *Expression, not bool) error {
656+
vals, ok := right.Value.([]any)
657+
if !ok {
658+
return fmt.Errorf("the right value must be a list of values for an array IN condition")
659+
}
660+
661+
if len(vals) == 0 {
662+
if not {
663+
b.writeString("TRUE")
664+
} else {
665+
b.writeString("FALSE")
666+
}
667+
return nil
668+
}
669+
670+
b.writeByte('(')
671+
if not {
672+
b.writeString("NOT ")
673+
}
674+
675+
b.writeString(b.ast.Dialect.GetArrayContainsFunction())
676+
b.writeByte('(')
677+
b.writeParenthesizedString(leftExpr)
678+
b.writeString(", [")
679+
// not handling NULL values separately as clickhouse hasAny function takes care of it however, duckdb ignores null values in the list_has_any function, but there is no reliable way to make it work,
680+
// but even using leftExpr IS NULL does not solve the issue as it checks for null array rather than null values in the array.
681+
for i, val := range vals {
682+
if i > 0 {
683+
b.writeByte(',')
684+
}
685+
b.writeString("?")
686+
b.args = append(b.args, val)
687+
}
688+
b.writeString("])")
689+
b.writeByte(')')
690+
691+
return nil
692+
}
693+
649694
func (b *sqlExprBuilder) writeByte(v byte) {
650695
_ = b.out.WriteByte(v)
651696
}
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
package metricsview
2+
3+
import (
4+
"testing"
5+
6+
runtimev1 "github.com/rilldata/rill/proto/gen/rill/runtime/v1"
7+
"github.com/rilldata/rill/runtime/drivers"
8+
"github.com/stretchr/testify/require"
9+
)
10+
11+
func TestArrayContainsCondition(t *testing.T) {
12+
mv := &runtimev1.MetricsViewSpec{
13+
Table: "test_table",
14+
Database: "",
15+
Connector: "",
16+
Dimensions: []*runtimev1.MetricsViewSpec_Dimension{
17+
{Name: "id", Column: "id"},
18+
{Name: "tags", Column: "tags", Unnest: true},
19+
{Name: "city", Column: "city"},
20+
},
21+
Measures: []*runtimev1.MetricsViewSpec_Measure{
22+
{Name: "count", Expression: "count(*)", Type: runtimev1.MetricsViewSpec_MEASURE_TYPE_SIMPLE},
23+
},
24+
}
25+
sec := skipMetricsViewSecurity{}
26+
27+
tests := []struct {
28+
name string
29+
dialect drivers.Dialect
30+
dims []Dimension
31+
where *Expression
32+
wantSQL string
33+
wantArgs []any
34+
}{
35+
{
36+
name: "duckdb: in on unnest dim uses list_has_any",
37+
dialect: drivers.DialectDuckDB,
38+
where: &Expression{Condition: &Condition{
39+
Operator: OperatorIn,
40+
Expressions: []*Expression{
41+
{Name: "tags"},
42+
{Value: []any{"a", "b", "c"}},
43+
},
44+
}},
45+
wantSQL: `(list_has_any(("tags"), [?,?,?]))`,
46+
wantArgs: []any{"a", "b", "c"},
47+
},
48+
{
49+
name: "duckdb: nin on unnest dim uses NOT list_has_any",
50+
dialect: drivers.DialectDuckDB,
51+
where: &Expression{Condition: &Condition{
52+
Operator: OperatorNin,
53+
Expressions: []*Expression{
54+
{Name: "tags"},
55+
{Value: []any{"a", "b"}},
56+
},
57+
}},
58+
wantSQL: `(NOT list_has_any(("tags"), [?,?]))`,
59+
wantArgs: []any{"a", "b"},
60+
},
61+
{
62+
name: "duckdb: in on unnest dim with empty list",
63+
dialect: drivers.DialectDuckDB,
64+
where: &Expression{Condition: &Condition{
65+
Operator: OperatorIn,
66+
Expressions: []*Expression{
67+
{Name: "tags"},
68+
{Value: []any{}},
69+
},
70+
}},
71+
wantSQL: `FALSE`,
72+
wantArgs: nil,
73+
},
74+
{
75+
name: "duckdb: nin on unnest dim with empty list",
76+
dialect: drivers.DialectDuckDB,
77+
where: &Expression{Condition: &Condition{
78+
Operator: OperatorNin,
79+
Expressions: []*Expression{
80+
{Name: "tags"},
81+
{Value: []any{}},
82+
},
83+
}},
84+
wantSQL: `TRUE`,
85+
wantArgs: nil,
86+
},
87+
{
88+
name: "duckdb: in on unnest dim with null value in list",
89+
dialect: drivers.DialectDuckDB,
90+
where: &Expression{Condition: &Condition{
91+
Operator: OperatorIn,
92+
Expressions: []*Expression{
93+
{Name: "tags"},
94+
{Value: []any{"a", nil, "b"}},
95+
},
96+
}},
97+
wantSQL: `(list_has_any(("tags"), [?,?,?]))`,
98+
wantArgs: []any{"a", nil, "b"},
99+
},
100+
{
101+
name: "clickhouse: in on unnest dim uses hasAny",
102+
dialect: drivers.DialectClickHouse,
103+
where: &Expression{Condition: &Condition{
104+
Operator: OperatorIn,
105+
Expressions: []*Expression{
106+
{Name: "tags"},
107+
{Value: []any{"a", "b"}},
108+
},
109+
}},
110+
wantSQL: `(hasAny(("tags"), [?,?]))`,
111+
wantArgs: []any{"a", "b"},
112+
},
113+
{
114+
name: "clickhouse: nin on unnest dim uses NOT hasAny",
115+
dialect: drivers.DialectClickHouse,
116+
where: &Expression{Condition: &Condition{
117+
Operator: OperatorNin,
118+
Expressions: []*Expression{
119+
{Name: "tags"},
120+
{Value: []any{"a", "b"}},
121+
},
122+
}},
123+
wantSQL: `(NOT hasAny(("tags"), [?,?]))`,
124+
wantArgs: []any{"a", "b"},
125+
},
126+
{
127+
name: "clickhouse: in on unnest dim with null values",
128+
dialect: drivers.DialectClickHouse,
129+
where: &Expression{Condition: &Condition{
130+
Operator: OperatorIn,
131+
Expressions: []*Expression{
132+
{Name: "tags"},
133+
{Value: []any{nil, "a"}},
134+
},
135+
}},
136+
wantSQL: `(hasAny(("tags"), [?,?]))`,
137+
wantArgs: []any{nil, "a"},
138+
},
139+
{
140+
name: "duckdb: in on non-unnest dim uses normal IN",
141+
dialect: drivers.DialectDuckDB,
142+
where: &Expression{Condition: &Condition{
143+
Operator: OperatorIn,
144+
Expressions: []*Expression{
145+
{Name: "city"},
146+
{Value: []any{"NYC", "LA"}},
147+
},
148+
}},
149+
wantSQL: `(("city") IN (?,?))`,
150+
wantArgs: []any{"NYC", "LA"},
151+
},
152+
{
153+
name: "duckdb: nin on non-unnest dim uses normal NOT IN",
154+
dialect: drivers.DialectDuckDB,
155+
where: &Expression{Condition: &Condition{
156+
Operator: OperatorNin,
157+
Expressions: []*Expression{
158+
{Name: "city"},
159+
{Value: []any{"NYC"}},
160+
},
161+
}},
162+
wantSQL: `(("city") NOT IN (?) OR ("city") IS NULL)`,
163+
wantArgs: []any{"NYC"},
164+
},
165+
{
166+
name: "duckdb: in on unnest dim nested in AND",
167+
dialect: drivers.DialectDuckDB,
168+
where: &Expression{Condition: &Condition{
169+
Operator: OperatorAnd,
170+
Expressions: []*Expression{
171+
{Condition: &Condition{
172+
Operator: OperatorIn,
173+
Expressions: []*Expression{
174+
{Name: "tags"},
175+
{Value: []any{"a", "b"}},
176+
},
177+
}},
178+
{Condition: &Condition{
179+
Operator: OperatorEq,
180+
Expressions: []*Expression{
181+
{Name: "city"},
182+
{Value: "NYC"},
183+
},
184+
}},
185+
},
186+
}},
187+
wantSQL: `((list_has_any(("tags"), [?,?])) AND (("city") = ?))`,
188+
wantArgs: []any{"a", "b", "NYC"},
189+
},
190+
{
191+
name: "duckdb: in on unnest dim already in select falls back to normal IN",
192+
dialect: drivers.DialectDuckDB,
193+
dims: []Dimension{{Name: "tags"}},
194+
where: &Expression{Condition: &Condition{
195+
Operator: OperatorIn,
196+
Expressions: []*Expression{
197+
{Name: "tags"},
198+
{Value: []any{"a", "b"}},
199+
},
200+
}},
201+
// When tags is in the query dimensions, it's already unnested via lateral join, so falls back to normal IN.
202+
// The AST qualifies the column with a table alias from the lateral unnest join.
203+
wantSQL: `(("t0"."tags") IN (?,?))`,
204+
wantArgs: []any{"a", "b"},
205+
},
206+
}
207+
208+
for _, tt := range tests {
209+
t.Run(tt.name, func(t *testing.T) {
210+
qry := &Query{
211+
MetricsView: "test",
212+
Dimensions: tt.dims,
213+
Measures: []Measure{{Name: "count"}},
214+
Where: tt.where,
215+
}
216+
217+
ast, err := NewAST(mv, sec, qry, tt.dialect)
218+
require.NoError(t, err)
219+
220+
sql, args, err := ast.SQLForExpression(tt.where, nil, false, false)
221+
require.NoError(t, err)
222+
require.Equal(t, tt.wantSQL, sql)
223+
require.Equal(t, tt.wantArgs, args)
224+
})
225+
}
226+
}

0 commit comments

Comments
 (0)