@@ -25,9 +25,11 @@ use arrow::record_batch::{RecordBatch, RecordBatchReader};
2525
2626use crate :: array_decoder:: NaiveStripeDecoder ;
2727use crate :: error:: Result ;
28+ use crate :: predicate:: Predicate ;
2829use crate :: projection:: ProjectionMask ;
2930use crate :: reader:: metadata:: { read_metadata, FileMetadata } ;
3031use crate :: reader:: ChunkReader ;
32+ use crate :: row_group_filter:: evaluate_predicate;
3133use crate :: row_selection:: RowSelection ;
3234use crate :: schema:: RootDataType ;
3335use crate :: stripe:: { Stripe , StripeMetadata } ;
@@ -42,6 +44,7 @@ pub struct ArrowReaderBuilder<R> {
4244 pub ( crate ) schema_ref : Option < SchemaRef > ,
4345 pub ( crate ) file_byte_range : Option < Range < usize > > ,
4446 pub ( crate ) row_selection : Option < RowSelection > ,
47+ pub ( crate ) predicate : Option < Predicate > ,
4548}
4649
4750impl < R > ArrowReaderBuilder < R > {
@@ -54,6 +57,7 @@ impl<R> ArrowReaderBuilder<R> {
5457 schema_ref : None ,
5558 file_byte_range : None ,
5659 row_selection : None ,
60+ predicate : None ,
5761 }
5862 }
5963
@@ -109,6 +113,44 @@ impl<R> ArrowReaderBuilder<R> {
109113 self
110114 }
111115
116+ /// Set a predicate for row group filtering
117+ ///
118+ /// The predicate will be evaluated against row group statistics to automatically
119+ /// generate a [`RowSelection`] that skips filtered row groups. This provides
120+ /// efficient predicate pushdown based on ORC row indexes.
121+ ///
122+ /// The predicate is evaluated lazily when each stripe is read, using the row group
123+ /// statistics from the stripe's index section.
124+ ///
125+ /// If both `with_predicate()` and `with_row_selection()` are called, the results
126+ /// are combined using logical AND (both conditions must be satisfied).
127+ ///
128+ /// # Example
129+ ///
130+ /// ```no_run
131+ /// # use std::fs::File;
132+ /// # use orc_rust::{ArrowReaderBuilder, Predicate, PredicateValue};
133+ /// let file = File::open("data.orc").unwrap();
134+ ///
135+ /// // Filter: age >= 18
136+ /// let predicate = Predicate::gte("age", PredicateValue::Int32(Some(18)));
137+ ///
138+ /// let reader = ArrowReaderBuilder::try_new(file)
139+ /// .unwrap()
140+ /// .with_predicate(predicate)
141+ /// .build();
142+ /// ```
143+ ///
144+ /// # Notes
145+ ///
146+ /// - Predicate evaluation requires row indexes to be present in the ORC file
147+ /// - If row indexes are missing, the predicate is ignored (all row groups are kept)
148+ /// - Only primitive columns have row indexes; predicates on compound types may be limited
149+ pub fn with_predicate ( mut self , predicate : Predicate ) -> Self {
150+ self . predicate = Some ( predicate) ;
151+ self
152+ }
153+
112154 /// Returns the currently computed schema
113155 ///
114156 /// Unless [`with_schema`](Self::with_schema) was called, this is computed dynamically
@@ -142,6 +184,7 @@ impl<R: ChunkReader> ArrowReaderBuilder<R> {
142184 . file_metadata
143185 . root_data_type ( )
144186 . project ( & self . projection ) ;
187+ let projected_data_type_clone = projected_data_type. clone ( ) ;
145188 let cursor = Cursor {
146189 reader : self . reader ,
147190 file_metadata : self . file_metadata ,
@@ -155,6 +198,8 @@ impl<R: ChunkReader> ArrowReaderBuilder<R> {
155198 current_stripe : None ,
156199 batch_size : self . batch_size ,
157200 row_selection : self . row_selection ,
201+ predicate : self . predicate ,
202+ projected_data_type : projected_data_type_clone,
158203 }
159204 }
160205}
@@ -165,6 +210,8 @@ pub struct ArrowReader<R> {
165210 current_stripe : Option < Box < dyn Iterator < Item = Result < RecordBatch > > + Send > > ,
166211 batch_size : usize ,
167212 row_selection : Option < RowSelection > ,
213+ predicate : Option < Predicate > ,
214+ projected_data_type : RootDataType ,
168215}
169216
170217impl < R > ArrowReader < R > {
@@ -178,21 +225,63 @@ impl<R: ChunkReader> ArrowReader<R> {
178225 let stripe = self . cursor . next ( ) . transpose ( ) ?;
179226 match stripe {
180227 Some ( stripe) => {
181- // Split off the row selection for this stripe
182228 let stripe_rows = stripe. number_of_rows ( ) ;
183- let selection = self . row_selection . as_mut ( ) . and_then ( |s| {
184- if s. row_count ( ) > 0 {
185- Some ( s. split_off ( stripe_rows) )
186- } else {
187- None
229+
230+ // Evaluate predicate if present
231+ let mut stripe_selection: Option < RowSelection > = None ;
232+ if let Some ( ref predicate) = self . predicate {
233+ // Try to read row indexes for this stripe
234+ match stripe. read_row_indexes ( & self . cursor . file_metadata ) {
235+ Ok ( row_index) => {
236+ // Evaluate predicate against row group statistics
237+ match evaluate_predicate ( predicate, & row_index, & self . projected_data_type ) {
238+ Ok ( row_group_filter) => {
239+ // Generate RowSelection from filter results
240+ let rows_per_group = self
241+ . cursor
242+ . file_metadata
243+ . row_index_stride ( )
244+ . unwrap_or ( 10_000 ) ;
245+ stripe_selection = Some ( RowSelection :: from_row_group_filter (
246+ & row_group_filter,
247+ rows_per_group,
248+ stripe_rows,
249+ ) ) ;
250+ }
251+ Err ( _) => {
252+ // Predicate evaluation failed (e.g., column not found)
253+ // Keep all rows (maybe)
254+ stripe_selection = Some ( RowSelection :: select_all ( stripe_rows) ) ;
255+ }
256+ }
257+ }
258+ Err ( _) => {
259+ // Row indexes not available, keep all rows (maybe)
260+ stripe_selection = Some ( RowSelection :: select_all ( stripe_rows) ) ;
261+ }
262+ }
263+ }
264+
265+ // Combine with existing row_selection if present
266+ let mut final_selection = stripe_selection;
267+ if let Some ( ref mut existing_selection) = self . row_selection {
268+ if existing_selection. row_count ( ) > 0 {
269+ let existing_for_stripe = existing_selection. split_off ( stripe_rows) ;
270+ final_selection = match final_selection {
271+ Some ( predicate_selection) => {
272+ // Both predicate and manual selection: combine with AND
273+ Some ( existing_for_stripe. and_then ( & predicate_selection) )
274+ }
275+ None => Some ( existing_for_stripe) ,
276+ } ;
188277 }
189- } ) ;
278+ }
190279
191280 let decoder = NaiveStripeDecoder :: new_with_selection (
192281 stripe,
193282 self . schema_ref . clone ( ) ,
194283 self . batch_size ,
195- selection ,
284+ final_selection ,
196285 ) ?;
197286 self . current_stripe = Some ( Box :: new ( decoder) ) ;
198287 self . next ( ) . transpose ( )
0 commit comments