Skip to content

Commit cf0e6c9

Browse files
committed
fix string statistics comparison
1 parent 1b7a553 commit cf0e6c9

2 files changed

Lines changed: 209 additions & 51 deletions

File tree

src/row_group_filter.rs

Lines changed: 186 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,21 @@ fn evaluate_comparison_with_stats(
225225
}
226226

227227
// String comparisons
228-
TypeStatistics::String { min, max, .. } => match value {
229-
PredicateValue::Utf8(Some(v)) => evaluate_string_comparison(min, max, op, v),
228+
TypeStatistics::String {
229+
lower_bound,
230+
upper_bound,
231+
is_exact_min,
232+
is_exact_max,
233+
..
234+
} => match value {
235+
PredicateValue::Utf8(Some(v)) => evaluate_string_comparison(
236+
lower_bound,
237+
upper_bound,
238+
op,
239+
v,
240+
*is_exact_min,
241+
*is_exact_max,
242+
),
230243
_ => {
231244
return Err(UnexpectedSnafu {
232245
msg: "Type mismatch: expected string value".to_string(),
@@ -271,7 +284,7 @@ fn evaluate_comparison_with_stats(
271284
PredicateValue::Utf8(Some(v)) => {
272285
// For decimal, we need to compare strings
273286
// This is a simplified implementation
274-
evaluate_string_comparison(min, max, op, v)
287+
evaluate_string_comparison(min, max, op, v, true, true)
275288
}
276289
_ => {
277290
return Err(UnexpectedSnafu {
@@ -437,31 +450,41 @@ fn evaluate_float_comparison(min: f64, max: f64, op: ComparisonOp, value: f64) -
437450
}
438451
}
439452

440-
fn evaluate_string_comparison(min: &str, max: &str, op: ComparisonOp, value: &str) -> bool {
453+
fn evaluate_string_comparison(
454+
lower_bound: &str,
455+
upper_bound: &str,
456+
op: ComparisonOp,
457+
value: &str,
458+
is_exact_min: bool,
459+
is_exact_max: bool,
460+
) -> bool {
461+
// Check if the column's minimum is <= value
462+
let min_le_value = lower_bound < value || (lower_bound == value && is_exact_min);
463+
464+
// Check if the column's maximum is >= value
465+
let max_ge_value = upper_bound > value || (upper_bound == value && is_exact_max);
466+
441467
match op {
442-
ComparisonOp::Equal => {
443-
// col = value: keep if value is within [min, max] lexicographically
444-
min <= value && value <= max
445-
}
468+
// Range intersection: The value must be reachable from both sides.
469+
ComparisonOp::Equal => min_le_value && max_ge_value,
470+
471+
// One-sided inclusive checks reuse the logic above.
472+
ComparisonOp::LessThanOrEqual => min_le_value,
473+
ComparisonOp::GreaterThanOrEqual => max_ge_value,
474+
475+
// Strict checks are simple.
476+
// Note: We don't need to check exactness here.
477+
// e.g., for LessThan: if lower_bound == value, then actual_min >= value,
478+
// so NO row can be strictly less than value.
479+
ComparisonOp::LessThan => lower_bound < value,
480+
ComparisonOp::GreaterThan => upper_bound > value,
481+
482+
// Special case: Only prune != if we are certain the column contains ONLY `value`.
446483
ComparisonOp::NotEqual => {
447-
// col != value: keep if value is not the only value
448-
!(min == value && max == value)
449-
}
450-
ComparisonOp::LessThan => {
451-
// col < value: keep if min < value
452-
min < value
453-
}
454-
ComparisonOp::LessThanOrEqual => {
455-
// col <= value: keep if min <= value
456-
min <= value
457-
}
458-
ComparisonOp::GreaterThan => {
459-
// col > value: keep if max > value
460-
max > value
461-
}
462-
ComparisonOp::GreaterThanOrEqual => {
463-
// col >= value: keep if max >= value
464-
max >= value
484+
let is_single_value_col = lower_bound == upper_bound && is_exact_min && is_exact_max;
485+
486+
// Keep unless it's a single-value column exactly matching the target
487+
!(is_single_value_col && lower_bound == value)
465488
}
466489
}
467490
}
@@ -1015,26 +1038,143 @@ mod tests {
10151038
}
10161039

10171040
#[test]
1018-
fn test_evaluate_predicate_missing_statistics() {
1019-
use crate::predicate::{Predicate, PredicateValue};
1020-
use crate::row_index::{RowGroupEntry, RowGroupIndex};
1021-
use std::collections::HashMap;
1022-
1023-
// Create row index with missing statistics
1024-
let mut columns = HashMap::new();
1025-
let entries = vec![
1026-
RowGroupEntry::new(None, vec![]), // No statistics
1027-
];
1028-
columns.insert(1, RowGroupIndex::new(entries, 10000, 1));
1029-
let row_index = StripeRowIndex::new(columns, 10000, 10000);
1030-
let schema = create_test_schema();
1031-
1032-
// Test: age > 10
1033-
// Should keep row group when statistics are missing (conservative)
1034-
let predicate = Predicate::gt("age", PredicateValue::Int32(Some(10)));
1035-
let result = super::evaluate_predicate(&predicate, &row_index, &schema).unwrap();
1041+
fn test_evaluate_string_comparison() {
1042+
use crate::predicate::ComparisonOp;
1043+
1044+
// Helper to make the call shorter
1045+
let eval = |lower: &str,
1046+
upper: &str,
1047+
op: ComparisonOp,
1048+
val: &str,
1049+
exact_min: bool,
1050+
exact_max: bool| {
1051+
super::evaluate_string_comparison(lower, upper, op, val, exact_min, exact_max)
1052+
};
10361053

1037-
assert_eq!(result.len(), 1);
1038-
assert!(result[0]); // Keep when statistics missing
1054+
// 1. EQUAL
1055+
// Range ["a", "c"], value "b" -> Keep
1056+
assert!(eval("a", "c", ComparisonOp::Equal, "b", true, true));
1057+
// Range ["a", "c"], value "d" -> Skip
1058+
assert!(!eval("a", "c", ComparisonOp::Equal, "d", true, true));
1059+
// Range ["a", "c"], value "a" -> Keep
1060+
assert!(eval("a", "c", ComparisonOp::Equal, "a", true, true));
1061+
// Range ["a", "c"], value "c" -> Keep
1062+
assert!(eval("a", "c", ComparisonOp::Equal, "c", true, true));
1063+
1064+
// Truncated stats (inexact)
1065+
// Range ["a", "c"] (min inexact), value "a" -> Skip (actual min > "a")
1066+
assert!(!eval("a", "c", ComparisonOp::Equal, "a", false, true));
1067+
// Range ["a", "c"] (max inexact), value "c" -> Skip (actual max < "c" max is rounded up)
1068+
assert!(!eval("a", "c", ComparisonOp::Equal, "c", true, false));
1069+
1070+
// 2. LESS THAN (< value)
1071+
// Range ["a", "c"], value "b". "a" < "b" -> Keep.
1072+
assert!(eval("a", "c", ComparisonOp::LessThan, "b", true, true));
1073+
// Range ["d", "e"], value "b". "d" >= "b" -> Skip.
1074+
assert!(!eval("d", "e", ComparisonOp::LessThan, "b", true, true));
1075+
// Range ["a", "c"], value "a". "a" < "a" is false.
1076+
// If exact, min="a", so no value < "a". Skip.
1077+
assert!(!eval("a", "c", ComparisonOp::LessThan, "a", true, true));
1078+
// If not exact, min="a" (truncated). Actual min >= "a".
1079+
// So actual min >= value. No value < "a". Skip.
1080+
assert!(!eval("a", "c", ComparisonOp::LessThan, "a", false, true));
1081+
1082+
// 3. GREATER THAN (> value)
1083+
// Range ["a", "c"], value "b". "c" > "b" -> Keep.
1084+
assert!(eval("a", "c", ComparisonOp::GreaterThan, "b", true, true));
1085+
// Range ["a", "b"], value "c". "b" <= "c" -> Skip.
1086+
assert!(!eval("a", "b", ComparisonOp::GreaterThan, "c", true, true));
1087+
// Range ["a", "c"], value "c". "c" > "c" is false.
1088+
// If exact, max="c", so no value > "c". Skip.
1089+
assert!(!eval("a", "c", ComparisonOp::GreaterThan, "c", true, true));
1090+
// If not exact, actual max < "c". Skip.
1091+
assert!(!eval("a", "c", ComparisonOp::GreaterThan, "c", true, false));
1092+
1093+
// 4. NOT EQUAL
1094+
// Range ["a", "c"], value "b". Keep.
1095+
assert!(eval("a", "c", ComparisonOp::NotEqual, "b", true, true));
1096+
// Range ["a", "a"], value "a".
1097+
// Exact: Skip.
1098+
assert!(!eval("a", "a", ComparisonOp::NotEqual, "a", true, true));
1099+
1100+
// 5. LESS THAN OR EQUAL (<= value)
1101+
// Range ["a", "c"], value "b". Keep.
1102+
assert!(eval(
1103+
"a",
1104+
"c",
1105+
ComparisonOp::LessThanOrEqual,
1106+
"b",
1107+
true,
1108+
true
1109+
));
1110+
// Range ["a", "c"], value "a". Keep.
1111+
assert!(eval(
1112+
"a",
1113+
"c",
1114+
ComparisonOp::LessThanOrEqual,
1115+
"a",
1116+
true,
1117+
true
1118+
));
1119+
// Range ["b", "c"], value "a". "b" > "a". Skip.
1120+
assert!(!eval(
1121+
"b",
1122+
"c",
1123+
ComparisonOp::LessThanOrEqual,
1124+
"a",
1125+
true,
1126+
true
1127+
));
1128+
// Inexact min:
1129+
// Range ["a", "c"] (min inexact), value "a".
1130+
// Actual_min > "a". Skip.
1131+
assert!(!eval(
1132+
"a",
1133+
"c",
1134+
ComparisonOp::LessThanOrEqual,
1135+
"a",
1136+
false,
1137+
true
1138+
));
1139+
1140+
// 6. GREATER THAN OR EQUAL (>= value)
1141+
// Range ["a", "c"], value "b". Keep.
1142+
assert!(eval(
1143+
"a",
1144+
"c",
1145+
ComparisonOp::GreaterThanOrEqual,
1146+
"b",
1147+
true,
1148+
true
1149+
));
1150+
// Range ["a", "c"], value "c". Keep.
1151+
assert!(eval(
1152+
"a",
1153+
"c",
1154+
ComparisonOp::GreaterThanOrEqual,
1155+
"c",
1156+
true,
1157+
true
1158+
));
1159+
// Range ["a", "b"], value "c". "b" < "c". Skip.
1160+
assert!(!eval(
1161+
"a",
1162+
"b",
1163+
ComparisonOp::GreaterThanOrEqual,
1164+
"c",
1165+
true,
1166+
true
1167+
));
1168+
// Inexact max:
1169+
// Range ["a", "b"] (max inexact), value "b".
1170+
// Actual_max < "b". Skip.
1171+
assert!(!eval(
1172+
"a",
1173+
"b",
1174+
ComparisonOp::GreaterThanOrEqual,
1175+
"b",
1176+
true,
1177+
false
1178+
));
10391179
}
10401180
}

src/statistics.rs

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,14 @@ pub enum TypeStatistics {
5858
sum: Option<f64>,
5959
},
6060
String {
61-
min: String,
62-
max: String,
61+
lower_bound: String,
62+
upper_bound: String,
6363
/// Total length of all strings
6464
sum: i64,
65+
/// If true, 'min' is an exact minimum. If false, it is a lower bound.
66+
is_exact_min: bool,
67+
/// If true, 'max' is an exact maximum. If false, it is an upper bound.
68+
is_exact_max: bool,
6569
},
6670
/// For Boolean
6771
Bucket { true_count: u64 },
@@ -101,7 +105,9 @@ impl TryFrom<&proto::ColumnStatistics> for ColumnStatistics {
101105
type Error = error::OrcError;
102106

103107
fn try_from(value: &proto::ColumnStatistics) -> Result<Self, Self::Error> {
104-
let type_statistics = if let Some(stats) = &value.int_statistics {
108+
let type_statistics = if value.number_of_values() == 0 {
109+
None
110+
} else if let Some(stats) = &value.int_statistics {
105111
Some(TypeStatistics::Integer {
106112
min: stats.minimum(),
107113
max: stats.maximum(),
@@ -114,10 +120,22 @@ impl TryFrom<&proto::ColumnStatistics> for ColumnStatistics {
114120
sum: stats.sum,
115121
})
116122
} else if let Some(stats) = &value.string_statistics {
123+
let (lower_bound, is_exact_min) = stats
124+
.minimum
125+
.as_deref()
126+
.map(|s| (s, true))
127+
.unwrap_or_else(|| (stats.lower_bound(), false));
128+
let (upper_bound, is_exact_max) = stats
129+
.maximum
130+
.as_deref()
131+
.map(|s| (s, true))
132+
.unwrap_or_else(|| (stats.upper_bound(), false));
117133
Some(TypeStatistics::String {
118-
min: stats.minimum().to_owned(),
119-
max: stats.maximum().to_owned(),
134+
lower_bound: lower_bound.to_owned(),
135+
upper_bound: upper_bound.to_owned(),
120136
sum: stats.sum(),
137+
is_exact_min,
138+
is_exact_max,
121139
})
122140
} else if let Some(stats) = &value.bucket_statistics {
123141
// TODO: false count?

0 commit comments

Comments
 (0)