@@ -9,6 +9,7 @@ use vortex_error::VortexResult;
99use vortex_error:: vortex_bail;
1010use vortex_error:: vortex_err;
1111use vortex_proto:: expr as pb;
12+ use vortex_scalar:: StringLike ;
1213
1314use crate :: ArrayRef ;
1415use crate :: compute:: LikeOptions ;
@@ -19,9 +20,16 @@ use crate::expr::ExecutionArgs;
1920use crate :: expr:: ExecutionResult ;
2021use crate :: expr:: ExprId ;
2122use crate :: expr:: Expression ;
23+ use crate :: expr:: Literal ;
24+ use crate :: expr:: StatsCatalog ;
2225use crate :: expr:: VTable ;
2326use crate :: expr:: VTableExt ;
2427use crate :: expr:: and;
28+ use crate :: expr:: gt;
29+ use crate :: expr:: gt_eq;
30+ use crate :: expr:: lit;
31+ use crate :: expr:: lt;
32+ use crate :: expr:: or;
2533
2634/// Expression that performs SQL LIKE pattern matching.
///
/// Negated (`NOT LIKE`) and case-insensitive (`ILIKE`) matching are selected through the
/// `negated` and `case_insensitive` flags of `LikeOptions`.
2735pub struct Like ;
@@ -127,6 +135,67 @@ impl VTable for Like {
// Always answers `false`, whatever options instance is supplied (the argument is unused).
// NOTE(review): presumably this tells the expression framework that LIKE needs no special
// handling of null inputs -- confirm against the `VTable` trait documentation.
127135 fn is_null_sensitive ( & self , _instance : & Self :: Options ) -> bool {
128136 false
129137 }
138+
139+ fn stat_falsification (
140+ & self ,
141+ like_opts : & LikeOptions ,
142+ expr : & Expression ,
143+ catalog : & dyn StatsCatalog ,
144+ ) -> Option < Expression > {
145+ // Attempt to do min/max pruning for LIKE 'exact' or LIKE 'prefix%'
146+
147+ // Don't attempt to handle ilike or negated like
148+ if like_opts. negated || like_opts. case_insensitive {
149+ return None ;
150+ }
151+
152+ // Extract the pattern out
153+ let pat = expr. child ( 1 ) . as_ :: < Literal > ( ) ;
154+
155+ // LIKE NULL is nonsensical, don't try to handle it
156+ let pat_str = pat. as_utf8 ( ) . value ( ) ?;
157+
158+ let src = expr. child ( 0 ) . clone ( ) ;
159+ let src_min = src. stat_min ( catalog) ?;
160+ let src_max = src. stat_max ( catalog) ?;
161+
162+ match LikeVariant :: from_str ( & pat_str) ? {
163+ LikeVariant :: Exact ( text) => {
164+ // col LIKE 'exact' ==> col.min > 'exact' || col.max < 'exact'
165+ Some ( or ( gt ( src_min, lit ( text) ) , lt ( src_max, lit ( text) ) ) )
166+ }
167+ LikeVariant :: Prefix ( prefix) => {
168+ // col LIKE 'prefix%' ==> col.max < 'prefix' || col.min >= 'prefiy'
169+ let succ = prefix. to_string ( ) . increment ( ) . ok ( ) ?;
170+
171+ Some ( or ( gt_eq ( src_min, lit ( succ) ) , lt ( src_max, lit ( prefix) ) ) )
172+ }
173+ }
174+ }
175+ }
176+
/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.
#[derive(Debug, PartialEq)]
enum LikeVariant<'a> {
    /// The pattern contains no wildcard or escape, so it must equal the value verbatim.
    Exact(&'a str),
    /// The pattern starts with this non-empty literal text, followed by a wildcard.
    Prefix(&'a str),
}

impl<'a> LikeVariant<'a> {
    /// Parse a LIKE pattern string into its relevant variant.
    ///
    /// - No `%`, `_` or `\` anywhere: `Exact` with the whole pattern.
    /// - Literal text followed by a `%` or `_` wildcard: `Prefix` with the text before the
    ///   first wildcard (anything after that wildcard is ignored).
    ///
    /// Returns `None` when the pattern starts with a wildcard, or when a backslash occurs
    /// before the first wildcard. A backslash escapes the following character, so the raw
    /// pattern bytes no longer equal the text being matched and cannot be used as a bound
    /// (e.g. `abc\%def` matches the literal string `abc%def`, not strings starting with `abc\`).
    fn from_str(string: &str) -> Option<LikeVariant<'_>> {
        // Position of the first character that has a special meaning in a LIKE pattern.
        let Some(special_pos) = string.find(['%', '_', '\\']) else {
            return Some(LikeVariant::Exact(string));
        };

        // An escape sequence in the literal part: bail out rather than derive wrong bounds.
        if string.as_bytes()[special_pos] == b'\\' {
            return None;
        }

        // Can't handle wildcard in the front.
        if special_pos == 0 {
            return None;
        }

        // `special_pos` is the byte offset of an ASCII character, so this is a char boundary.
        Some(LikeVariant::Prefix(&string[..special_pos]))
    }
}
131200
132201pub fn like ( child : Expression , pattern : Expression ) -> Expression {
@@ -176,12 +245,17 @@ mod tests {
176245
177246 use crate :: ToCanonical ;
178247 use crate :: arrays:: BoolArray ;
248+ use crate :: expr:: col;
179249 use crate :: expr:: exprs:: get_item:: get_item;
250+ use crate :: expr:: exprs:: like:: LikeVariant ;
180251 use crate :: expr:: exprs:: like:: like;
181252 use crate :: expr:: exprs:: like:: not_ilike;
182253 use crate :: expr:: exprs:: literal:: lit;
183254 use crate :: expr:: exprs:: not:: not;
184255 use crate :: expr:: exprs:: root:: root;
256+ use crate :: expr:: ilike;
257+ use crate :: expr:: not_like;
258+ use crate :: expr:: pruning:: pruning_expr:: TrackingStatsCatalog ;
185259
186260 #[ test]
187261 fn invert_booleans ( ) {
@@ -217,4 +291,66 @@ mod tests {
217291 let expr2 = not_ilike ( root ( ) , lit ( "test*" ) ) ;
218292 assert_eq ! ( expr2. to_string( ) , "$ not ilike \" test*\" " ) ;
219293 }
294+
#[test]
fn test_like_variant() {
    // Patterns that start with literal text parse into a variant.
    let supported = [
        ("simple", LikeVariant::Exact("simple")),
        ("prefix%", LikeVariant::Prefix("prefix")),
        ("first%rest_stuff", LikeVariant::Prefix("first")),
    ];
    for (pattern, expected) in supported {
        assert_eq!(LikeVariant::from_str(pattern), Some(expected));
    }

    // Patterns that start with a wildcard are not supported.
    for pattern in ["%suffix", "_pattern"] {
        assert_eq!(LikeVariant::from_str(pattern), None);
    }
}
315+
#[test]
fn test_like_pushdown() {
    // LIKE filters with a literal prefix, and exact-match LIKE filters, should turn into
    // min/max stats predicates usable at scan time.
    let catalog = TrackingStatsCatalog::default();

    // Builds `a LIKE <pattern>` and asks for its stats falsification.
    let falsify = |pattern: &str| like(col("a"), lit(pattern)).stat_falsification(&catalog);

    // Single trailing wildcard
    let single_wildcard = falsify("prefix%").expect("LIKE stat falsification");
    insta::assert_snapshot!(single_wildcard, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);

    // Multiple wildcards: only the text before the first one is used
    let multi_percent = falsify("pref%ix%").expect("LIKE stat falsification");
    insta::assert_snapshot!(multi_percent, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);

    let multi_underscore = falsify("pref_ix_").expect("LIKE stat falsification");
    insta::assert_snapshot!(multi_underscore, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);

    // Exact match
    let exact = falsify("exactly").expect("LIKE stat falsification");
    insta::assert_snapshot!(exact, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);

    // Suffix search skips pushdown
    let suffix = falsify("%suffix");
    assert_eq!(suffix, None);

    // NOT LIKE, ILIKE not supported currently
    assert_eq!(
        None,
        not_like(col("a"), lit("a")).stat_falsification(&catalog)
    );
    assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
}
220356}
0 commit comments