Skip to content

Commit 352a387

Browse files
committed
add regression-test
1 parent 7ca6cc2 commit 352a387

1 file changed

Lines changed: 397 additions & 0 deletions

File tree

tests/row_selection/main.rs

Lines changed: 397 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,397 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Regression tests for RowSelection functionality
19+
20+
use std::fs::File;
21+
22+
use arrow::record_batch::RecordBatch;
23+
use arrow::util::pretty;
24+
use orc_rust::arrow_reader::ArrowReaderBuilder;
25+
use orc_rust::projection::ProjectionMask;
26+
use orc_rust::row_selection::{RowSelection, RowSelector};
27+
28+
fn basic_path(path: &str) -> String {
29+
let dir = env!("CARGO_MANIFEST_DIR");
30+
format!("{dir}/tests/basic/data/{path}")
31+
}
32+
33+
// Helper function to compare batches with expected output
34+
fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
35+
let formatted = pretty::pretty_format_batches(batches).unwrap().to_string();
36+
let actual_lines: Vec<_> = formatted.trim().lines().collect();
37+
assert_eq!(
38+
&actual_lines, expected_lines,
39+
"\n\nexpected:\n\n{expected_lines:#?}\nactual:\n\n{actual_lines:#?}\n\n"
40+
);
41+
}
42+
43+
/// Skips the first 2 rows of `test.orc`, selects the next 2 and skips the
/// rest, then checks both the row count and the rendered row contents.
#[test]
fn test_row_selection_skip_first_select_middle() {
    // Skip first 2 rows, select next 2 rows, skip rest
    let path = basic_path("test.orc");
    let f = File::open(path).expect("no file found");

    let selection = vec![
        RowSelector::skip(2),
        RowSelector::select(2),
        RowSelector::skip(1),
    ]
    .into();

    let reader = ArrowReaderBuilder::try_new(f)
        .unwrap()
        .with_row_selection(selection)
        .build();

    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();

    // Should only read 2 rows (rows index 2 and 3 from the file)
    assert_eq!(total_rows, 2);

    // Verify data content - should be rows 2 and 3 (0-indexed).
    // Every cell is left-aligned and padded to the width of its border
    // segment, exactly as the pretty printer renders it.
    let expected = [
        "+-----+------+------------+-----+----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+---------------------+-------------+----------------+",
        "| a   | b    | str_direct | d   | e  | f     | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple    | date_simple | tinyint_simple |",
        "+-----+------+------------+-----+----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+---------------------+-------------+----------------+",
        "|     |      |            |     |    |       |                    |                        |           |               |            |                |               |                   | 1            | ccc           | ccc           | 2023-01-01T00:00:00 | 2023-01-01  | 1              |",
        "| 4.0 | true | ddd        | ccc | bb | ccccc | 5                  | -5                     | 4         | 2             | 3          | -3             | 3             | -3                | 5            | dddd          | bb            | 2023-02-01T00:00:00 | 2023-02-01  | 127            |",
        "+-----+------+------------+-----+----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+---------------------+-------------+----------------+",
    ];
    assert_batches_eq(&batches, &expected);
}
78+
79+
/// A selection covering all 5 rows of `test.orc` must yield 5 rows.
#[test]
fn test_row_selection_select_all() {
    let file = File::open(basic_path("test.orc")).expect("no file found");

    // Select all 5 rows
    let reader = ArrowReaderBuilder::try_new(file)
        .unwrap()
        .with_row_selection(RowSelection::select_all(5))
        .build();

    let mut row_count = 0usize;
    for batch in reader {
        row_count += batch.unwrap().num_rows();
    }

    assert_eq!(row_count, 5);
}
97+
98+
/// A selection that skips all 5 rows of `test.orc` must yield no rows.
#[test]
fn test_row_selection_skip_all() {
    let file = File::open(basic_path("test.orc")).expect("no file found");

    // Skip all 5 rows
    let reader = ArrowReaderBuilder::try_new(file)
        .unwrap()
        .with_row_selection(RowSelection::skip_all(5))
        .build();

    let mut row_count = 0usize;
    for batch in reader {
        row_count += batch.unwrap().num_rows();
    }

    // Should read 0 rows
    assert_eq!(row_count, 0);
}
117+
118+
/// Selects only the first row of `test.orc` and checks the row count and the
/// rendered contents of that row.
#[test]
fn test_row_selection_select_first_only() {
    let path = basic_path("test.orc");
    let f = File::open(path).expect("no file found");

    // Select only first row
    let selection = vec![RowSelector::select(1), RowSelector::skip(4)].into();

    let reader = ArrowReaderBuilder::try_new(f)
        .unwrap()
        .with_row_selection(selection)
        .build();

    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();

    assert_eq!(total_rows, 1);

    // Every cell is left-aligned and padded to the width of its border
    // segment, exactly as the pretty printer renders it.
    let expected = [
        "+-----+------+------------+---+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+-------------------------+-------------+----------------+",
        "| a   | b    | str_direct | d | e   | f     | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple        | date_simple | tinyint_simple |",
        "+-----+------+------------+---+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+-------------------------+-------------+----------------+",
        "| 1.0 | true | a          | a | ddd | aaaaa | 5                  | -5                     | 1         | 5             | 1          | -1             | 1             | -1                | 5            | a             | eeeee         | 2023-04-01T20:15:30.002 | 2023-04-01  | -1             |",
        "+-----+------+------------+---+-----+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+-------------------------+-------------+----------------+",
    ];
    assert_batches_eq(&batches, &expected);
}
145+
146+
/// Skips the first 4 rows of `test.orc`, selects the last one and checks the
/// row count and the rendered contents of that row.
#[test]
fn test_row_selection_select_last_only() {
    let path = basic_path("test.orc");
    let f = File::open(path).expect("no file found");

    // Skip first 4 rows, select last row
    let selection = vec![RowSelector::skip(4), RowSelector::select(1)].into();

    let reader = ArrowReaderBuilder::try_new(f)
        .unwrap()
        .with_row_selection(selection)
        .build();

    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();

    assert_eq!(total_rows, 1);

    // Every cell is left-aligned and padded to the width of its border
    // segment, exactly as the pretty printer renders it.
    let expected = [
        "+-----+-------+------------+-----+---+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+---------------------+-------------+----------------+",
        "| a   | b     | str_direct | d   | e | f     | int_short_repeated | int_neg_short_repeated | int_delta | int_neg_delta | int_direct | int_neg_direct | bigint_direct | bigint_neg_direct | bigint_other | utf8_increase | utf8_decrease | timestamp_simple    | date_simple | tinyint_simple |",
        "+-----+-------+------------+-----+---+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+---------------------+-------------+----------------+",
        "| 5.0 | false | ee         | ddd | a | ddddd | 5                  | -5                     | 5         | 1             | 2          | -2             | 2             | -2                | 5            | eeeee         | a             | 2023-03-01T00:00:00 | 2023-03-01  | -127           |",
        "+-----+-------+------------+-----+---+-------+--------------------+------------------------+-----------+---------------+------------+----------------+---------------+-------------------+--------------+---------------+---------------+---------------------+-------------+----------------+",
    ];
    assert_batches_eq(&batches, &expected);
}
173+
174+
/// Builds a selection from the index ranges `0..2` and `3..5` (out of 5 rows)
/// and checks that 4 rows are read from `test.orc`.
#[test]
fn test_row_selection_with_consecutive_ranges() {
    let file = File::open(basic_path("test.orc")).expect("no file found");

    // Select rows at indices 0-1 and 3-4 (skip row 2)
    let ranges = vec![0..2, 3..5];
    let selection = RowSelection::from_consecutive_ranges(ranges.into_iter(), 5);

    let reader = ArrowReaderBuilder::try_new(file)
        .unwrap()
        .with_row_selection(selection)
        .build();

    let mut row_count = 0usize;
    for batch in reader {
        row_count += batch.unwrap().num_rows();
    }

    // Should read 4 rows (skip the middle one)
    assert_eq!(row_count, 4);
}
193+
194+
/// Combines a column projection (`a`, `b`) with a row selection on `test.orc`
/// and checks the row count, the column count and the rendered contents.
#[test]
fn test_row_selection_with_projection() {
    // Test that row selection works with column projection
    let path = basic_path("test.orc");
    let f = File::open(path).expect("no file found");

    let builder = ArrowReaderBuilder::try_new(f).unwrap();
    let projection =
        ProjectionMask::named_roots(builder.file_metadata().root_data_type(), &["a", "b"]);

    let selection = vec![
        RowSelector::skip(1),
        RowSelector::select(2),
        RowSelector::skip(2),
    ]
    .into();

    let reader = builder
        .with_projection(projection)
        .with_row_selection(selection)
        .build();

    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();

    assert_eq!(total_rows, 2);
    assert_eq!(batches[0].num_columns(), 2); // Only 2 columns projected

    // Every cell is left-aligned and padded to the width of its border
    // segment, exactly as the pretty printer renders it.
    let expected = [
        "+-----+-------+",
        "| a   | b     |",
        "+-----+-------+",
        "| 2.0 | false |",
        "|     |       |",
        "+-----+-------+",
    ];
    assert_batches_eq(&batches, &expected);
}
232+
233+
/// Applies a row selection to `nested_struct.orc` (first 2 rows and the last
/// row) and checks the row count and the rendered struct values.
#[test]
fn test_row_selection_with_nested_struct() {
    let path = basic_path("nested_struct.orc");
    let f = File::open(path).expect("no file found");

    // Select first 2 rows and last row
    let selection = vec![
        RowSelector::select(2),
        RowSelector::skip(2),
        RowSelector::select(1),
    ]
    .into();

    let reader = ArrowReaderBuilder::try_new(f)
        .unwrap()
        .with_row_selection(selection)
        .build();

    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();

    assert_eq!(total_rows, 3);

    // Every cell is left-aligned and padded to the width of its border
    // segment, exactly as the pretty printer renders it.
    let expected = [
        "+-------------------+",
        "| nest              |",
        "+-------------------+",
        "| {a: 1.0, b: true} |",
        "| {a: 3.0, b: }     |",
        "| {a: -3.0, b: }    |",
        "+-------------------+",
    ];
    assert_batches_eq(&batches, &expected);
}
267+
268+
/// Applies a row selection to `nested_array.orc` (rows at index 1 and 2) and
/// checks the row count and the rendered list values.
#[test]
fn test_row_selection_with_nested_array() {
    let path = basic_path("nested_array.orc");
    let f = File::open(path).expect("no file found");

    // Select middle rows (index 1-2)
    let selection = vec![
        RowSelector::skip(1),
        RowSelector::select(2),
        RowSelector::skip(2),
    ]
    .into();

    let reader = ArrowReaderBuilder::try_new(f)
        .unwrap()
        .with_row_selection(selection)
        .build();

    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();

    assert_eq!(total_rows, 2);

    // Every cell is left-aligned and padded to the width of its border
    // segment, exactly as the pretty printer renders it.
    let expected = [
        "+--------------------+",
        "| value              |",
        "+--------------------+",
        "| [5, , 32, 4, 15]   |",
        "| [16, , 3, 4, 5, 6] |",
        "+--------------------+",
    ];
    assert_batches_eq(&batches, &expected);
}
301+
302+
/// Applies a skip/select/skip selection to the larger `string_long_long.orc`
/// fixture and checks that exactly the 500 selected rows are read.
#[test]
fn test_row_selection_with_large_file() {
    // Test with a larger file that spans multiple stripes
    let file = File::open(basic_path("string_long_long.orc")).expect("no file found");

    // Skip first 1000 rows, select next 500, skip rest
    let selection: RowSelection = vec![
        RowSelector::skip(1000),
        RowSelector::select(500),
        RowSelector::skip(8500),
    ]
    .into();

    let reader = ArrowReaderBuilder::try_new(file)
        .unwrap()
        .with_row_selection(selection)
        .build();

    let mut row_count = 0usize;
    for batch in reader {
        row_count += batch.unwrap().num_rows();
    }

    assert_eq!(row_count, 500);
}
326+
327+
/// A selection that selects nothing (built with `skip_all(5)`) must yield no
/// rows from `test.orc`.
#[test]
fn test_row_selection_empty_selection() {
    let file = File::open(basic_path("test.orc")).expect("no file found");

    // Empty selection - skip all rows
    let nothing_selected = RowSelection::skip_all(5);

    let reader = ArrowReaderBuilder::try_new(file)
        .unwrap()
        .with_row_selection(nothing_selected)
        .build();

    let mut row_count = 0usize;
    for batch in reader {
        row_count += batch.unwrap().num_rows();
    }

    // Empty selection should read 0 rows
    assert_eq!(row_count, 0);
}
346+
347+
/// Applies a skip/select/skip selection to the `string_dict_gzip.orc` fixture
/// and checks that exactly the 20 selected rows are read.
#[test]
fn test_row_selection_with_compression() {
    // Test that row selection works with compressed files
    let file = File::open(basic_path("string_dict_gzip.orc")).expect("no file found");

    let selection: RowSelection = vec![
        RowSelector::skip(10),
        RowSelector::select(20),
        RowSelector::skip(34),
    ]
    .into();

    let reader = ArrowReaderBuilder::try_new(file)
        .unwrap()
        .with_row_selection(selection)
        .build();

    let mut row_count = 0usize;
    for batch in reader {
        row_count += batch.unwrap().num_rows();
    }

    assert_eq!(row_count, 20);
}
370+
371+
// TODO: Async version doesn't support row_selection yet
372+
// Need to update async_arrow_reader.rs to pass row_selection to NaiveStripeDecoder
373+
// #[cfg(feature = "async")]
374+
// #[tokio::test]
375+
// async fn test_row_selection_async() {
376+
// let path = basic_path("test.orc");
377+
// let f = tokio::fs::File::open(path).await.unwrap();
378+
//
379+
// let selection = vec![
380+
// RowSelector::skip(1),
381+
// RowSelector::select(3),
382+
// RowSelector::skip(1),
383+
// ]
384+
// .into();
385+
//
386+
// let reader = ArrowReaderBuilder::try_new_async(f)
387+
// .await
388+
// .unwrap()
389+
// .with_row_selection(selection)
390+
// .build_async();
391+
//
392+
// let batches = reader.try_collect::<Vec<_>>().await.unwrap();
393+
// let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
394+
//
395+
// assert_eq!(total_rows, 3);
396+
// }
397+

0 commit comments

Comments
 (0)