@@ -225,8 +225,21 @@ fn evaluate_comparison_with_stats(
225225 }
226226
227227 // String comparisons
228- TypeStatistics :: String { min, max, .. } => match value {
229- PredicateValue :: Utf8 ( Some ( v) ) => evaluate_string_comparison ( min, max, op, v) ,
228+ TypeStatistics :: String {
229+ lower_bound,
230+ upper_bound,
231+ is_exact_min,
232+ is_exact_max,
233+ ..
234+ } => match value {
235+ PredicateValue :: Utf8 ( Some ( v) ) => evaluate_string_comparison (
236+ lower_bound,
237+ upper_bound,
238+ op,
239+ v,
240+ * is_exact_min,
241+ * is_exact_max,
242+ ) ,
230243 _ => {
231244 return Err ( UnexpectedSnafu {
232245 msg : "Type mismatch: expected string value" . to_string ( ) ,
@@ -271,7 +284,7 @@ fn evaluate_comparison_with_stats(
271284 PredicateValue :: Utf8 ( Some ( v) ) => {
272285 // For decimal, we need to compare strings
273286 // This is a simplified implementation
274- evaluate_string_comparison ( min, max, op, v)
287+ evaluate_string_comparison ( min, max, op, v, true , true )
275288 }
276289 _ => {
277290 return Err ( UnexpectedSnafu {
@@ -437,31 +450,41 @@ fn evaluate_float_comparison(min: f64, max: f64, op: ComparisonOp, value: f64) -
437450 }
438451}
439452
440- fn evaluate_string_comparison ( min : & str , max : & str , op : ComparisonOp , value : & str ) -> bool {
453+ fn evaluate_string_comparison (
454+ lower_bound : & str ,
455+ upper_bound : & str ,
456+ op : ComparisonOp ,
457+ value : & str ,
458+ is_exact_min : bool ,
459+ is_exact_max : bool ,
460+ ) -> bool {
461+ // Check if the column's minimum is <= value
462+ let min_le_value = lower_bound < value || ( lower_bound == value && is_exact_min) ;
463+
464+ // Check if the column's maximum is >= value
465+ let max_ge_value = upper_bound > value || ( upper_bound == value && is_exact_max) ;
466+
441467 match op {
442- ComparisonOp :: Equal => {
443- // col = value: keep if value is within [min, max] lexicographically
444- min <= value && value <= max
445- }
468+ // Range intersection: The value must be reachable from both sides.
469+ ComparisonOp :: Equal => min_le_value && max_ge_value,
470+
471+ // One-sided inclusive checks reuse the logic above.
472+ ComparisonOp :: LessThanOrEqual => min_le_value,
473+ ComparisonOp :: GreaterThanOrEqual => max_ge_value,
474+
475+ // Strict checks are simple.
476+ // Note: We don't need to check exactness here.
477+ // e.g., for LessThan: if lower_bound == value, then actual_min >= value,
478+ // so NO row can be strictly less than value.
479+ ComparisonOp :: LessThan => lower_bound < value,
480+ ComparisonOp :: GreaterThan => upper_bound > value,
481+
482+ // Special case: Only prune != if we are certain the column contains ONLY `value`.
446483 ComparisonOp :: NotEqual => {
447- // col != value: keep if value is not the only value
448- !( min == value && max == value)
449- }
450- ComparisonOp :: LessThan => {
451- // col < value: keep if min < value
452- min < value
453- }
454- ComparisonOp :: LessThanOrEqual => {
455- // col <= value: keep if min <= value
456- min <= value
457- }
458- ComparisonOp :: GreaterThan => {
459- // col > value: keep if max > value
460- max > value
461- }
462- ComparisonOp :: GreaterThanOrEqual => {
463- // col >= value: keep if max >= value
464- max >= value
484+ let is_single_value_col = lower_bound == upper_bound && is_exact_min && is_exact_max;
485+
486+ // Keep unless it's a single-value column exactly matching the target
487+ !( is_single_value_col && lower_bound == value)
465488 }
466489 }
467490}
@@ -1015,26 +1038,143 @@ mod tests {
10151038 }
10161039
10171040 #[ test]
1018- fn test_evaluate_predicate_missing_statistics ( ) {
1019- use crate :: predicate:: { Predicate , PredicateValue } ;
1020- use crate :: row_index:: { RowGroupEntry , RowGroupIndex } ;
1021- use std:: collections:: HashMap ;
1022-
1023- // Create row index with missing statistics
1024- let mut columns = HashMap :: new ( ) ;
1025- let entries = vec ! [
1026- RowGroupEntry :: new( None , vec![ ] ) , // No statistics
1027- ] ;
1028- columns. insert ( 1 , RowGroupIndex :: new ( entries, 10000 , 1 ) ) ;
1029- let row_index = StripeRowIndex :: new ( columns, 10000 , 10000 ) ;
1030- let schema = create_test_schema ( ) ;
1031-
1032- // Test: age > 10
1033- // Should keep row group when statistics are missing (conservative)
1034- let predicate = Predicate :: gt ( "age" , PredicateValue :: Int32 ( Some ( 10 ) ) ) ;
1035- let result = super :: evaluate_predicate ( & predicate, & row_index, & schema) . unwrap ( ) ;
1041+ fn test_evaluate_string_comparison ( ) {
1042+ use crate :: predicate:: ComparisonOp ;
1043+
1044+ // Helper to make the call shorter
1045+ let eval = |lower : & str ,
1046+ upper : & str ,
1047+ op : ComparisonOp ,
1048+ val : & str ,
1049+ exact_min : bool ,
1050+ exact_max : bool | {
1051+ super :: evaluate_string_comparison ( lower, upper, op, val, exact_min, exact_max)
1052+ } ;
10361053
1037- assert_eq ! ( result. len( ) , 1 ) ;
1038- assert ! ( result[ 0 ] ) ; // Keep when statistics missing
1054+ // 1. EQUAL
1055+ // Range ["a", "c"], value "b" -> Keep
1056+ assert ! ( eval( "a" , "c" , ComparisonOp :: Equal , "b" , true , true ) ) ;
1057+ // Range ["a", "c"], value "d" -> Skip
1058+ assert ! ( !eval( "a" , "c" , ComparisonOp :: Equal , "d" , true , true ) ) ;
1059+ // Range ["a", "c"], value "a" -> Keep
1060+ assert ! ( eval( "a" , "c" , ComparisonOp :: Equal , "a" , true , true ) ) ;
1061+ // Range ["a", "c"], value "c" -> Keep
1062+ assert ! ( eval( "a" , "c" , ComparisonOp :: Equal , "c" , true , true ) ) ;
1063+
1064+ // Truncated stats (inexact)
1065+ // Range ["a", "c"] (min inexact), value "a" -> Skip (actual min > "a")
1066+ assert ! ( !eval( "a" , "c" , ComparisonOp :: Equal , "a" , false , true ) ) ;
1067+ // Range ["a", "c"] (max inexact), value "c" -> Skip (actual max < "c" max is rounded up)
1068+ assert ! ( !eval( "a" , "c" , ComparisonOp :: Equal , "c" , true , false ) ) ;
1069+
1070+ // 2. LESS THAN (< value)
1071+ // Range ["a", "c"], value "b". "a" < "b" -> Keep.
1072+ assert ! ( eval( "a" , "c" , ComparisonOp :: LessThan , "b" , true , true ) ) ;
1073+ // Range ["d", "e"], value "b". "d" >= "b" -> Skip.
1074+ assert ! ( !eval( "d" , "e" , ComparisonOp :: LessThan , "b" , true , true ) ) ;
1075+ // Range ["a", "c"], value "a". "a" < "a" is false.
1076+ // If exact, min="a", so no value < "a". Skip.
1077+ assert ! ( !eval( "a" , "c" , ComparisonOp :: LessThan , "a" , true , true ) ) ;
1078+ // If not exact, min="a" (truncated). Actual min >= "a".
1079+ // So actual min >= value. No value < "a". Skip.
1080+ assert ! ( !eval( "a" , "c" , ComparisonOp :: LessThan , "a" , false , true ) ) ;
1081+
1082+ // 3. GREATER THAN (> value)
1083+ // Range ["a", "c"], value "b". "c" > "b" -> Keep.
1084+ assert ! ( eval( "a" , "c" , ComparisonOp :: GreaterThan , "b" , true , true ) ) ;
1085+ // Range ["a", "b"], value "c". "b" <= "c" -> Skip.
1086+ assert ! ( !eval( "a" , "b" , ComparisonOp :: GreaterThan , "c" , true , true ) ) ;
1087+ // Range ["a", "c"], value "c". "c" > "c" is false.
1088+ // If exact, max="c", so no value > "c". Skip.
1089+ assert ! ( !eval( "a" , "c" , ComparisonOp :: GreaterThan , "c" , true , true ) ) ;
1090+ // If not exact, actual max < "c". Skip.
1091+ assert ! ( !eval( "a" , "c" , ComparisonOp :: GreaterThan , "c" , true , false ) ) ;
1092+
1093+ // 4. NOT EQUAL
1094+ // Range ["a", "c"], value "b". Keep.
1095+ assert ! ( eval( "a" , "c" , ComparisonOp :: NotEqual , "b" , true , true ) ) ;
1096+ // Range ["a", "a"], value "a".
1097+ // Exact: Skip.
1098+ assert ! ( !eval( "a" , "a" , ComparisonOp :: NotEqual , "a" , true , true ) ) ;
1099+
1100+ // 5. LESS THAN OR EQUAL (<= value)
1101+ // Range ["a", "c"], value "b". Keep.
1102+ assert ! ( eval(
1103+ "a" ,
1104+ "c" ,
1105+ ComparisonOp :: LessThanOrEqual ,
1106+ "b" ,
1107+ true ,
1108+ true
1109+ ) ) ;
1110+ // Range ["a", "c"], value "a". Keep.
1111+ assert ! ( eval(
1112+ "a" ,
1113+ "c" ,
1114+ ComparisonOp :: LessThanOrEqual ,
1115+ "a" ,
1116+ true ,
1117+ true
1118+ ) ) ;
1119+ // Range ["b", "c"], value "a". "b" > "a". Skip.
1120+ assert ! ( !eval(
1121+ "b" ,
1122+ "c" ,
1123+ ComparisonOp :: LessThanOrEqual ,
1124+ "a" ,
1125+ true ,
1126+ true
1127+ ) ) ;
1128+ // Inexact min:
1129+ // Range ["a", "c"] (min inexact), value "a".
1130+ // Actual_min > "a". Skip.
1131+ assert ! ( !eval(
1132+ "a" ,
1133+ "c" ,
1134+ ComparisonOp :: LessThanOrEqual ,
1135+ "a" ,
1136+ false ,
1137+ true
1138+ ) ) ;
1139+
1140+ // 6. GREATER THAN OR EQUAL (>= value)
1141+ // Range ["a", "c"], value "b". Keep.
1142+ assert ! ( eval(
1143+ "a" ,
1144+ "c" ,
1145+ ComparisonOp :: GreaterThanOrEqual ,
1146+ "b" ,
1147+ true ,
1148+ true
1149+ ) ) ;
1150+ // Range ["a", "c"], value "c". Keep.
1151+ assert ! ( eval(
1152+ "a" ,
1153+ "c" ,
1154+ ComparisonOp :: GreaterThanOrEqual ,
1155+ "c" ,
1156+ true ,
1157+ true
1158+ ) ) ;
1159+ // Range ["a", "b"], value "c". "b" < "c". Skip.
1160+ assert ! ( !eval(
1161+ "a" ,
1162+ "b" ,
1163+ ComparisonOp :: GreaterThanOrEqual ,
1164+ "c" ,
1165+ true ,
1166+ true
1167+ ) ) ;
1168+ // Inexact max:
1169+ // Range ["a", "b"] (max inexact), value "b".
1170+ // Actual_max < "b". Skip.
1171+ assert ! ( !eval(
1172+ "a" ,
1173+ "b" ,
1174+ ComparisonOp :: GreaterThanOrEqual ,
1175+ "b" ,
1176+ true ,
1177+ false
1178+ ) ) ;
10391179 }
10401180}
0 commit comments