Skip to content

Commit 0e5e554

Browse files
authored
fix: Parquet prefiltered with projection pushdown (#18714)
1 parent ca383a0 commit 0e5e554

File tree

2 files changed

+23
-0
lines changed

2 files changed

+23
-0
lines changed

crates/polars-io/src/parquet/read/read_impl.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,11 +266,21 @@ fn rg_to_dfs_prefiltered(
266266
let num_live_columns = live_variables.len();
267267
let num_dead_columns = projection.len() - num_live_columns;
268268

269+
// @NOTE: This is probably already sorted, but just to be sure.
270+
let mut projection_sorted = projection.to_vec();
271+
projection_sorted.sort();
272+
269273
// We create two look-up tables that map indexes offsets into the live- and dead-set onto
270274
// column indexes of the schema.
271275
let mut live_idx_to_col_idx = Vec::with_capacity(num_live_columns);
272276
let mut dead_idx_to_col_idx = Vec::with_capacity(num_dead_columns);
277+
let mut offset = 0;
273278
for (i, field) in schema.iter_values().enumerate() {
279+
if projection_sorted.get(offset).copied() != Some(i) {
280+
continue;
281+
}
282+
283+
offset += 1;
274284
if live_variables.contains(&field.name[..]) {
275285
live_idx_to_col_idx.push(i);
276286
} else {

py-polars/tests/unit/io/test_parquet.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1904,3 +1904,16 @@ def test_write_binary_open_file(tmp_path: Path) -> None:
19041904

19051905
out = pl.read_parquet(path)
19061906
assert_frame_equal(out, df)
1907+
1908+
1909+
def test_prefilter_with_projection() -> None:
1910+
f = io.BytesIO()
1911+
pl.DataFrame({"a": [1], "b": [2]}).write_parquet(f)
1912+
1913+
f.seek(0)
1914+
(
1915+
pl.scan_parquet(f, parallel="prefiltered")
1916+
.filter(pl.col.a == 1)
1917+
.select(pl.col.a)
1918+
.collect()
1919+
)

0 commit comments

Comments
 (0)