Skip to content

Commit 6891fec

Browse files
fix: Fix panic reading empty parquet with multiple boolean columns (#23159)
1 parent 7d11a0b commit 6891fec

File tree

2 files changed: 36 additions and 13 deletions.

crates/polars-io/src/cloud/polars_object_store.rs

Lines changed: 18 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,12 @@ use std::ops::Range;
22

33
use bytes::Bytes;
44
use futures::{StreamExt, TryStreamExt};
5+
use hashbrown::hash_map::RawEntryMut;
56
use object_store::path::Path;
67
use object_store::{ObjectMeta, ObjectStore};
78
use polars_core::prelude::{InitHashMaps, PlHashMap};
89
use polars_error::{PolarsError, PolarsResult};
10+
use polars_utils::mmap::MemSlice;
911
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
1012

1113
use crate::pl_async::{
@@ -214,14 +216,11 @@ impl PolarsObjectStore {
214216
///
215217
/// # Panics
216218
/// Panics if the same range start is used by more than 1 range.
217-
pub async fn get_ranges_sort<
218-
K: TryFrom<usize, Error = impl std::fmt::Debug> + std::hash::Hash + Eq,
219-
T: From<Bytes>,
220-
>(
219+
pub async fn get_ranges_sort(
221220
&self,
222221
path: &Path,
223222
ranges: &mut [Range<usize>],
224-
) -> PolarsResult<PlHashMap<K, T>> {
223+
) -> PolarsResult<PlHashMap<usize, MemSlice>> {
225224
if ranges.is_empty() {
226225
return Ok(Default::default());
227226
}
@@ -280,16 +279,23 @@ impl PolarsObjectStore {
280279

281280
assert_eq!(bytes.len(), full_range.len());
282281

282+
let bytes = MemSlice::from_bytes(bytes);
283+
283284
for range in &ranges[current_offset..end] {
284-
let v = out.insert(
285-
K::try_from(range.start).unwrap(),
286-
T::from(bytes.slice(
287-
range.start - full_range.start
288-
..range.end - full_range.start,
289-
)),
285+
let mem_slice = bytes.slice(
286+
range.start - full_range.start..range.end - full_range.start,
290287
);
291288

292-
assert!(v.is_none()); // duplicate range start
289+
match out.raw_entry_mut().from_key(&range.start) {
290+
RawEntryMut::Vacant(slot) => {
291+
slot.insert(range.start, mem_slice);
292+
},
293+
RawEntryMut::Occupied(mut slot) => {
294+
if slot.get_mut().len() < mem_slice.len() {
295+
*slot.get_mut() = mem_slice;
296+
}
297+
},
298+
}
293299
}
294300

295301
current_offset = end;

py-polars/tests/unit/io/test_parquet.py

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3281,7 +3281,7 @@ def test_parquet_read_timezone_22506() -> None:
32813281

32823282
@pytest.mark.parametrize("static", [True, False])
32833283
@pytest.mark.parametrize("lazy", [True, False])
3284-
def test_read_write_metadata(tmp_path: Path, static: bool, lazy: bool) -> None:
3284+
def test_read_write_metadata(static: bool, lazy: bool) -> None:
32853285
metadata = {"hello": "world", "something": "else"}
32863286
md: ParquetMetadata = metadata
32873287
if not static:
@@ -3378,3 +3378,20 @@ def multiple_test_sorting_columns() -> None:
33783378
assert roundtrip.get_column("a").is_sorted()
33793379
assert not roundtrip.get_column("b").is_sorted()
33803380
assert_frame_equal(roundtrip.sort("b"), df.sort("b"))
3381+
3382+
3383+
@pytest.mark.write_disk
3384+
def test_read_parquet_duplicate_range_start_fetch_23139(tmp_path: Path) -> None:
3385+
tmp_path.mkdir(exist_ok=True)
3386+
path = tmp_path / "data.parquet"
3387+
3388+
df = pl.DataFrame(
3389+
schema={
3390+
"a": pl.Boolean,
3391+
"b": pl.Boolean,
3392+
}
3393+
)
3394+
3395+
df.write_parquet(path, use_pyarrow=True)
3396+
3397+
assert_frame_equal(pl.read_parquet(path), df)

0 commit comments

Comments (0)