@@ -28,6 +28,7 @@ use crate::error::Result;
2828use crate :: projection:: ProjectionMask ;
2929use crate :: reader:: metadata:: { read_metadata, FileMetadata } ;
3030use crate :: reader:: ChunkReader ;
31+ use crate :: row_selection:: RowSelection ;
3132use crate :: schema:: RootDataType ;
3233use crate :: stripe:: { Stripe , StripeMetadata } ;
3334
@@ -40,6 +41,7 @@ pub struct ArrowReaderBuilder<R> {
4041 pub ( crate ) projection : ProjectionMask ,
4142 pub ( crate ) schema_ref : Option < SchemaRef > ,
4243 pub ( crate ) file_byte_range : Option < Range < usize > > ,
44+ pub ( crate ) row_selection : Option < RowSelection > ,
4345}
4446
4547impl < R > ArrowReaderBuilder < R > {
@@ -51,6 +53,7 @@ impl<R> ArrowReaderBuilder<R> {
5153 projection : ProjectionMask :: all ( ) ,
5254 schema_ref : None ,
5355 file_byte_range : None ,
56+ row_selection : None ,
5457 }
5558 }
5659
@@ -79,6 +82,33 @@ impl<R> ArrowReaderBuilder<R> {
7982 self
8083 }
8184
85+ /// Set a [`RowSelection`] to filter rows
86+ ///
87+ /// The [`RowSelection`] specifies which rows should be decoded from the ORC file.
88+ /// This can be used to skip rows that don't match predicates, reducing I/O and
89+ /// improving query performance.
90+ ///
91+ /// # Example
92+ ///
93+ /// ```no_run
94+ /// # use std::fs::File;
95+ /// # use orc_rust::arrow_reader::ArrowReaderBuilder;
96+ /// # use orc_rust::row_selection::{RowSelection, RowSelector};
97+ /// let file = File::open("data.orc").unwrap();
98+ /// let selection = vec![
99+ /// RowSelector::skip(100),
100+ /// RowSelector::select(50),
101+ /// ].into();
102+ /// let reader = ArrowReaderBuilder::try_new(file)
103+ /// .unwrap()
104+ /// .with_row_selection(selection)
105+ /// .build();
106+ /// ```
107+ pub fn with_row_selection ( mut self , row_selection : RowSelection ) -> Self {
108+ self . row_selection = Some ( row_selection) ;
109+ self
110+ }
111+
82112 /// Returns the currently computed schema
83113 ///
84114 /// Unless [`with_schema`](Self::with_schema) was called, this is computed dynamically
@@ -124,6 +154,7 @@ impl<R: ChunkReader> ArrowReaderBuilder<R> {
124154 schema_ref,
125155 current_stripe : None ,
126156 batch_size : self . batch_size ,
157+ row_selection : self . row_selection ,
127158 }
128159 }
129160}
@@ -133,6 +164,7 @@ pub struct ArrowReader<R> {
133164 schema_ref : SchemaRef ,
134165 current_stripe : Option < Box < dyn Iterator < Item = Result < RecordBatch > > + Send > > ,
135166 batch_size : usize ,
167+ row_selection : Option < RowSelection > ,
136168}
137169
138170impl < R > ArrowReader < R > {
@@ -146,8 +178,22 @@ impl<R: ChunkReader> ArrowReader<R> {
146178 let stripe = self . cursor . next ( ) . transpose ( ) ?;
147179 match stripe {
148180 Some ( stripe) => {
149- let decoder =
150- NaiveStripeDecoder :: new ( stripe, self . schema_ref . clone ( ) , self . batch_size ) ?;
181+ // Split off the row selection for this stripe
182+ let stripe_rows = stripe. number_of_rows ( ) ;
183+ let selection = self . row_selection . as_mut ( ) . and_then ( |s| {
184+ if s. row_count ( ) > 0 {
185+ Some ( s. split_off ( stripe_rows) )
186+ } else {
187+ None
188+ }
189+ } ) ;
190+
191+ let decoder = NaiveStripeDecoder :: new_with_selection (
192+ stripe,
193+ self . schema_ref . clone ( ) ,
194+ self . batch_size ,
195+ selection,
196+ ) ?;
151197 self . current_stripe = Some ( Box :: new ( decoder) ) ;
152198 self . next ( ) . transpose ( )
153199 }
0 commit comments