Skip to content

Commit 41a4f63

Browse files
authored
feat: stats pruning for LIKE filters (vortex-data#6049)
Fixes vortex-data#6128

There are two types of `LIKE` filters that we can prune using our zone maps:

| Filter | Zone Map Predicate |
|--------|--------|
| `col LIKE 'exact'` | `col.min > 'exact' OR col.max < 'exact'` |
| `col LIKE 'prefix%'` | `col.min >= 'prefiy' OR col.max < 'prefix'` |

(`'prefiy'` is `'prefix'` with its last character incremented, i.e. the smallest string that sorts after every string starting with `'prefix'`.)

I extracted out the existing logic from `Utf8Scalar::upper_bound` into a trait so we can make sure logic is consistent across stats gen/pruning.

---------

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent 1ba9c80 commit 41a4f63

6 files changed

Lines changed: 223 additions & 32 deletions

File tree

vortex-array/src/expr/exprs/like.rs

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use vortex_error::VortexResult;
99
use vortex_error::vortex_bail;
1010
use vortex_error::vortex_err;
1111
use vortex_proto::expr as pb;
12+
use vortex_scalar::StringLike;
1213

1314
use crate::ArrayRef;
1415
use crate::compute::LikeOptions;
@@ -19,9 +20,16 @@ use crate::expr::ExecutionArgs;
1920
use crate::expr::ExecutionResult;
2021
use crate::expr::ExprId;
2122
use crate::expr::Expression;
23+
use crate::expr::Literal;
24+
use crate::expr::StatsCatalog;
2225
use crate::expr::VTable;
2326
use crate::expr::VTableExt;
2427
use crate::expr::and;
28+
use crate::expr::gt;
29+
use crate::expr::gt_eq;
30+
use crate::expr::lit;
31+
use crate::expr::lt;
32+
use crate::expr::or;
2533

2634
/// Expression that performs SQL LIKE pattern matching.
2735
pub struct Like;
@@ -127,6 +135,67 @@ impl VTable for Like {
127135
fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
128136
false
129137
}
138+
139+
fn stat_falsification(
140+
&self,
141+
like_opts: &LikeOptions,
142+
expr: &Expression,
143+
catalog: &dyn StatsCatalog,
144+
) -> Option<Expression> {
145+
// Attempt to do min/max pruning for LIKE 'exact' or LIKE 'prefix%'
146+
147+
// Don't attempt to handle ilike or negated like
148+
if like_opts.negated || like_opts.case_insensitive {
149+
return None;
150+
}
151+
152+
// Extract the pattern out
153+
let pat = expr.child(1).as_::<Literal>();
154+
155+
// LIKE NULL is nonsensical, don't try to handle it
156+
let pat_str = pat.as_utf8().value()?;
157+
158+
let src = expr.child(0).clone();
159+
let src_min = src.stat_min(catalog)?;
160+
let src_max = src.stat_max(catalog)?;
161+
162+
match LikeVariant::from_str(&pat_str)? {
163+
LikeVariant::Exact(text) => {
164+
// col LIKE 'exact' ==> col.min > 'exact' || col.max < 'exact'
165+
Some(or(gt(src_min, lit(text)), lt(src_max, lit(text))))
166+
}
167+
LikeVariant::Prefix(prefix) => {
168+
// col LIKE 'prefix%' ==> col.max < 'prefix' || col.min >= 'prefiy'
169+
let succ = prefix.to_string().increment().ok()?;
170+
171+
Some(or(gt_eq(src_min, lit(succ)), lt(src_max, lit(prefix))))
172+
}
173+
}
174+
}
175+
}
176+
177+
/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.
#[derive(Debug, PartialEq)]
enum LikeVariant<'a> {
    /// The pattern contains no wildcard, so it can only match this exact text.
    Exact(&'a str),
    /// The pattern starts with this non-empty literal text, followed by a wildcard.
    /// Anything after the first wildcard is ignored, so this is a necessary (not a
    /// sufficient) condition for a match -- which is all that pruning needs.
    Prefix(&'a str),
}

impl<'a> LikeVariant<'a> {
    /// Parse a LIKE pattern string into its relevant variant.
    ///
    /// Returns `None` for patterns that cannot be used for min/max pruning:
    ///
    /// * patterns that start with a wildcard (`%` or `_`), since they have no literal prefix;
    /// * patterns containing a backslash. Backslash is the LIKE escape character, so in
    ///   `'a\%b'` the `%` is a literal and the real prefix is `a%b`, not `a\`. Treating the
    ///   raw text as the prefix would build a predicate that can wrongly prune zones holding
    ///   matching rows, so we conservatively refuse to prune on escaped patterns.
    fn from_str(string: &str) -> Option<LikeVariant<'_>> {
        // Escaped patterns would need unescaping before they can be compared against stats.
        if string.contains('\\') {
            return None;
        }

        match string.find(['%', '_']) {
            // No wildcard at all: only the exact text can match.
            None => Some(LikeVariant::Exact(string)),
            // Can't handle wildcard in the front.
            Some(0) => None,
            // Both wildcards are ASCII, so `wildcard_pos` is always a char boundary.
            Some(wildcard_pos) => Some(LikeVariant::Prefix(&string[..wildcard_pos])),
        }
    }
}
131200

132201
pub fn like(child: Expression, pattern: Expression) -> Expression {
@@ -176,12 +245,17 @@ mod tests {
176245

177246
use crate::ToCanonical;
178247
use crate::arrays::BoolArray;
248+
use crate::expr::col;
179249
use crate::expr::exprs::get_item::get_item;
250+
use crate::expr::exprs::like::LikeVariant;
180251
use crate::expr::exprs::like::like;
181252
use crate::expr::exprs::like::not_ilike;
182253
use crate::expr::exprs::literal::lit;
183254
use crate::expr::exprs::not::not;
184255
use crate::expr::exprs::root::root;
256+
use crate::expr::ilike;
257+
use crate::expr::not_like;
258+
use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
185259

186260
#[test]
187261
fn invert_booleans() {
@@ -217,4 +291,66 @@ mod tests {
217291
let expr2 = not_ilike(root(), lit("test*"));
218292
assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
219293
}
294+
295+
#[test]
fn test_like_variant() {
    // Each row pairs a LIKE pattern with the variant it should parse to;
    // `None` marks a pattern we do not know how to prune on.
    let cases = [
        // Supported patterns
        ("simple", Some(LikeVariant::Exact("simple"))),
        ("prefix%", Some(LikeVariant::Prefix("prefix"))),
        ("first%rest_stuff", Some(LikeVariant::Prefix("first"))),
        // Unsupported patterns
        ("%suffix", None),
        ("_pattern", None),
    ];

    for (pattern, expected) in cases {
        assert_eq!(LikeVariant::from_str(pattern), expected);
    }
}
315+
316+
#[test]
fn test_like_pushdown() {
    // LIKE filters with a literal prefix, or with no wildcard at all, should be turned
    // into min/max stats predicates that can be evaluated at scan time.
    let catalog = TrackingStatsCatalog::default();

    // Single trailing wildcard: bounds come from the prefix and its successor.
    let trailing_percent = like(col("a"), lit("prefix%"))
        .stat_falsification(&catalog)
        .expect("LIKE stat falsification");
    insta::assert_snapshot!(trailing_percent, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);

    // Multiple wildcards: only the text before the first wildcard is used.
    let multi_percent = like(col("a"), lit("pref%ix%"))
        .stat_falsification(&catalog)
        .expect("LIKE stat falsification");
    insta::assert_snapshot!(multi_percent, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);

    let multi_underscore = like(col("a"), lit("pref_ix_"))
        .stat_falsification(&catalog)
        .expect("LIKE stat falsification");
    insta::assert_snapshot!(multi_underscore, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);

    // Exact match
    let exact = like(col("a"), lit("exactly"))
        .stat_falsification(&catalog)
        .expect("LIKE stat falsification");
    insta::assert_snapshot!(exact, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);

    // Suffix search skips pushdown
    let suffix = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
    assert_eq!(suffix, None);

    // NOT LIKE, ILIKE not supported currently
    let negated = not_like(col("a"), lit("a")).stat_falsification(&catalog);
    assert_eq!(None, negated);
    let case_insensitive = ilike(col("a"), lit("a")).stat_falsification(&catalog);
    assert_eq!(None, case_insensitive);
}
220356
}

vortex-array/src/expr/exprs/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ pub(crate) mod operators;
1717
pub(crate) mod pack;
1818
pub(crate) mod root;
1919
pub(crate) mod select;
20-
2120
pub use between::*;
2221
pub use binary::*;
2322
pub use cast::*;

vortex-array/src/expr/pruning/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
mod pruning_expr;
4+
pub(crate) mod pruning_expr;
55
mod relation;
66

77
pub use pruning_expr::RequiredStats;

vortex-array/src/expr/pruning/pruning_expr.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ pub type RequiredStats = Relation<FieldPath, Stat>;
2323
// A catalog that returns a stat column whenever it is required, tracking all accessed
2424
// stats and returning them later.
2525
#[derive(Default)]
26-
struct TrackingStatsCatalog {
26+
pub(crate) struct TrackingStatsCatalog {
2727
usage: RefCell<HashMap<(FieldPath, Stat), Expression>>,
2828
}
2929

@@ -37,7 +37,7 @@ impl TrackingStatsCatalog {
3737

3838
// A catalog that returns a stat column if it exists in the given scope.
3939
struct ScopeStatsCatalog<'a> {
40-
any_catalog: TrackingStatsCatalog,
40+
inner: TrackingStatsCatalog,
4141
available_stats: &'a FieldPathSet,
4242
}
4343

@@ -46,7 +46,7 @@ impl StatsCatalog for ScopeStatsCatalog<'_> {
4646
let stat_path = field_path.clone().push(stat.name());
4747

4848
if self.available_stats.contains(&stat_path) {
49-
self.any_catalog.stats_ref(field_path, stat)
49+
self.inner.stats_ref(field_path, stat)
5050
} else {
5151
None
5252
}
@@ -93,15 +93,15 @@ pub fn checked_pruning_expr(
9393
available_stats: &FieldPathSet,
9494
) -> Option<(Expression, RequiredStats)> {
9595
let catalog = ScopeStatsCatalog {
96-
any_catalog: Default::default(),
96+
inner: Default::default(),
9797
available_stats,
9898
};
9999

100100
let expr = expr.stat_falsification(&catalog)?;
101101

102102
// TODO(joe): filter access by used exprs
103103
let mut relation: Relation<FieldPath, Stat> = Relation::new();
104-
for ((field_path, stat), _) in catalog.any_catalog.into_usages() {
104+
for ((field_path, stat), _) in catalog.inner.into_usages() {
105105
relation.insert(field_path, stat)
106106
}
107107

vortex-scalar/src/binary.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ impl<'a> BinaryScalar<'a> {
9595
self.value.as_ref().map(|v| v.as_ref())
9696
}
9797

98-
/// Constructs a value at most `max_length` in size that's greater than this value.
98+
/// Constructs the next scalar at most `max_length` bytes that's lexicographically greater than
99+
/// this.
99100
///
100101
/// Returns None if constructing a greater value would overflow.
101102
pub fn upper_bound(self, max_length: usize) -> Option<Self> {

vortex-scalar/src/utf8.rs

Lines changed: 79 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,72 @@ use crate::InnerScalarValue;
2222
use crate::Scalar;
2323
use crate::ScalarValue;
2424

25+
/// Types that can hold a valid UTF-8 string.
26+
pub trait StringLike: private::Sealed + Sized {
27+
/// Replace the last codepoint in the string with the next codepoint.
28+
///
29+
/// This operation will attempt to reuse the original memory.
30+
///
31+
/// If incrementing the last char fails, or if the string is empty,
32+
/// we return an Err with the original unmodified string.
33+
fn increment(self) -> Result<Self, Self>;
34+
}
35+
36+
mod private {
37+
use vortex_buffer::BufferString;
38+
39+
use crate::StringLike;
40+
41+
pub trait Sealed {}
42+
43+
impl Sealed for String {}
44+
45+
impl StringLike for String {
46+
fn increment(mut self) -> Result<String, String> {
47+
let Some(last_char) = self.pop() else {
48+
return Ok(self);
49+
};
50+
51+
if let Some(next_char) = char::from_u32(last_char as u32 + 1) {
52+
self.push(next_char);
53+
Ok(self)
54+
} else {
55+
// Return the original string
56+
self.push(last_char);
57+
Err(self)
58+
}
59+
}
60+
}
61+
62+
impl Sealed for BufferString {}
63+
64+
impl StringLike for BufferString {
65+
#[allow(clippy::unwrap_in_result, clippy::expect_used)]
66+
fn increment(self) -> Result<BufferString, BufferString> {
67+
if self.is_empty() {
68+
return Err(self);
69+
}
70+
71+
// Chop off the last char and return it here.
72+
let (last_idx, last_char) = self.char_indices().last().expect("non-empty");
73+
if let Some(next_char) = char::from_u32(last_char as u32 + 1)
74+
&& next_char.len_utf8() == last_char.len_utf8()
75+
{
76+
// Because the next char has the same byte width as the last char, we can overwrite
77+
// the memory directly.
78+
let mut bytes = self.into_inner().into_mut();
79+
next_char.encode_utf8(&mut bytes.as_mut()[last_idx..]);
80+
81+
// SAFETY: we overwrite the last valid char with new valid char, so
82+
// the buffer continues to hold valid UTF-8 data.
83+
unsafe { Ok(BufferString::new_unchecked(bytes.freeze())) }
84+
} else {
85+
Err(self)
86+
}
87+
}
88+
}
89+
}
90+
2591
/// A scalar value representing a UTF-8 encoded string.
2692
///
2793
/// This type provides a view into a UTF-8 string scalar value, which can be either
@@ -92,7 +158,8 @@ impl<'a> Utf8Scalar<'a> {
92158
self.value.as_ref().map(|v| v.as_ref())
93159
}
94160

95-
/// Constructs a value at most `max_length` in size that's greater than this value.
161+
/// Constructs the next scalar at most `max_length` bytes that's lexicographically greater than
162+
/// this.
96163
///
97164
/// Returns None if constructing a greater value would overflow.
98165
pub fn upper_bound(self, max_length: usize) -> Option<Self> {
@@ -102,29 +169,16 @@ impl<'a> Utf8Scalar<'a> {
102169
.rfind(|p| value.is_char_boundary(*p))
103170
.vortex_expect("Failed to find utf8 character boundary");
104171

105-
let utf8_mut = value
106-
.get(..utf8_split_pos)
107-
.vortex_expect("Slicing with existing index");
108-
109-
for (idx, original_char) in utf8_mut.char_indices().rev() {
110-
let original_len = original_char.len_utf8();
111-
if let Some(next_char) = char::from_u32(original_char as u32 + 1) {
112-
// do not allow increasing byte width of incremented char
113-
if next_char.len_utf8() == original_len {
114-
let sliced = value.inner().slice(0..idx + original_len);
115-
drop(value);
116-
let mut result = sliced.into_mut();
117-
next_char.encode_utf8(&mut result[idx..]);
118-
return Some(Self {
119-
dtype: self.dtype,
120-
value: Some(Arc::new(unsafe {
121-
BufferString::new_unchecked(result.freeze())
122-
})),
123-
});
124-
}
125-
}
126-
}
127-
None
172+
let sliced = value.inner().slice(..utf8_split_pos);
173+
drop(value);
174+
175+
// SAFETY: we slice to a char boundary so the sliced range contains valid UTF-8.
176+
let sliced_buf = unsafe { BufferString::new_unchecked(sliced) };
177+
let incremented = sliced_buf.increment().ok()?;
178+
Some(Self {
179+
dtype: self.dtype,
180+
value: Some(Arc::new(incremented)),
181+
})
128182
} else {
129183
Some(Self {
130184
dtype: self.dtype,
@@ -382,6 +436,7 @@ mod tests {
382436
#[test]
383437
fn upper_bound_overflow() {
384438
let utf8 = Scalar::utf8("🂑🂒🂓", Nullability::NonNullable);
439+
385440
assert!(
386441
Utf8Scalar::try_from(&utf8)
387442
.vortex_expect("utf8 scalar conversion should succeed")

0 commit comments

Comments
 (0)