Skip to content

Commit 609ac98

Browse files
authored
fix(rust, python): fix freeze/stall when writing more than 2^31 string values to parquet (#5366)
1 parent 1f45cb4 commit 609ac98

File tree

9 files changed

+92
-46
lines changed

9 files changed

+92
-46
lines changed

Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ bitflags = "1.3"
3232
[workspace.dependencies.arrow]
3333
package = "arrow2"
3434
# git = "https://github.com/jorgecarleitao/arrow2"
35-
# git = "https://github.com/ritchie46/arrow2"
36-
# rev = "6c102a0c3e2dbeb185360dd3d5c3637b5e2028fd"
35+
git = "https://github.com/ritchie46/arrow2"
36+
# rev = "e106cff24dc0c8942603712d7332a97871dce44e"
3737
# path = "../../../arrow2"
38-
# branch = "comparison_and_validity"
38+
branch = "2022_11_06"
3939
version = "0.14.1"
4040
default-features = false
4141
features = [

polars/polars-arrow/src/kernels/list.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,7 @@ mod test {
140140

141141
let out = sublist_get_indexes(&arr, 1);
142142
assert_eq!(
143-
out.into_iter()
144-
.map(|opt_v| opt_v.cloned())
145-
.collect::<Vec<_>>(),
143+
out.into_iter().collect::<Vec<_>>(),
146144
&[None, None, None, Some(4), Some(7), Some(10)]
147145
);
148146
}

polars/polars-arrow/src/kernels/rolling/nulls/min_max.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use arrow::bitmap::utils::{count_zeros, zip_validity};
1+
use arrow::bitmap::utils::{count_zeros, ZipValidityIter};
22
use nulls;
33
use nulls::{rolling_apply_agg_window, RollingAggWindowNulls};
44

@@ -9,7 +9,7 @@ pub fn is_reverse_sorted_max_nulls<T: NativeType + PartialOrd + IsFloat>(
99
validity: &Bitmap,
1010
) -> bool {
1111
let mut current_max = None;
12-
for opt_v in zip_validity(values.iter(), Some(validity.iter())) {
12+
for opt_v in ZipValidityIter::new(values.iter(), validity.iter()) {
1313
match (current_max, opt_v) {
1414
// do nothing
1515
(None, None) => {}

polars/polars-arrow/src/trusted_len/mod.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ mod rev;
55
use std::iter::Scan;
66
use std::slice::Iter;
77

8-
use arrow::bitmap::utils::{BitmapIter, ZipValidity};
8+
use arrow::bitmap::utils::{BitmapIter, ZipValidity, ZipValidityIter};
99
pub use push_unchecked::*;
1010
pub use rev::FromIteratorReversed;
1111

@@ -66,7 +66,14 @@ unsafe impl<I: Iterator<Item = J>, J> TrustedLen for TrustMyLength<I, J> {}
6666
unsafe impl<T> TrustedLen for std::ops::Range<T> where std::ops::Range<T>: Iterator {}
6767
unsafe impl TrustedLen for arrow::array::Utf8ValuesIter<'_, i64> {}
6868
unsafe impl TrustedLen for arrow::array::BinaryValueIter<'_, i64> {}
69-
unsafe impl<T, I: TrustedLen + Iterator<Item = T>> TrustedLen for ZipValidity<'_, T, I> {}
69+
unsafe impl<T, I: TrustedLen + Iterator<Item = T>, V: TrustedLen + Iterator<Item = bool>> TrustedLen
70+
for ZipValidityIter<T, I, V>
71+
{
72+
}
73+
unsafe impl<T, I: TrustedLen + Iterator<Item = T>, V: TrustedLen + Iterator<Item = bool>> TrustedLen
74+
for ZipValidity<T, I, V>
75+
{
76+
}
7077
unsafe impl TrustedLen for BitmapIter<'_> {}
7178
unsafe impl<A: TrustedLen> TrustedLen for std::iter::StepBy<A> {}
7279

polars/polars-core/src/chunked_array/iterator/mod.rs

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -306,18 +306,36 @@ impl<'a> IntoIterator for &'a ListChunked {
306306
fn into_iter(self) -> Self::IntoIter {
307307
let dtype = self.inner_dtype();
308308

309-
// we know that we only iterate over length == self.len()
310-
unsafe {
311-
Box::new(
312-
self.downcast_iter()
313-
.flat_map(|arr| arr.iter())
314-
.trust_my_length(self.len())
315-
.map(move |arr| {
316-
arr.map(|arr| {
317-
Series::from_chunks_and_dtype_unchecked("", vec![arr], &dtype)
318-
})
319-
}),
320-
)
309+
if self.null_count() == 0 {
310+
// we know that we only iterate over length == self.len()
311+
unsafe {
312+
Box::new(
313+
self.downcast_iter()
314+
.flat_map(|arr| arr.iter().unwrap_required())
315+
.trust_my_length(self.len())
316+
.map(move |arr| {
317+
Some(Series::from_chunks_and_dtype_unchecked(
318+
"",
319+
vec![arr],
320+
&dtype,
321+
))
322+
}),
323+
)
324+
}
325+
} else {
326+
// we know that we only iterate over length == self.len()
327+
unsafe {
328+
Box::new(
329+
self.downcast_iter()
330+
.flat_map(|arr| arr.iter())
331+
.trust_my_length(self.len())
332+
.map(move |arr| {
333+
arr.map(|arr| {
334+
Series::from_chunks_and_dtype_unchecked("", vec![arr], &dtype)
335+
})
336+
}),
337+
)
338+
}
321339
}
322340
}
323341
}

polars/polars-core/src/chunked_array/ops/apply.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,7 @@ impl<'a> ChunkApply<'a, Series, Series> for ListChunked {
652652
});
653653
f(x)
654654
});
655-
let len = values.len();
655+
let len = array.len();
656656

657657
// we know the iterators len
658658
unsafe {

polars/polars-core/src/series/iterator.rs

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -105,21 +105,34 @@ impl Series {
105105
})
106106
}
107107
} else {
108-
// TODO! null_count paths, but first exactsize iters must be implmeneted upstream
109108
match dtype {
110109
DataType::Utf8 => {
111110
let arr = arr.as_any().downcast_ref::<Utf8Array<i64>>().unwrap();
112-
Box::new(arr.iter().map(|value| match value {
113-
Some(value) => AnyValue::Utf8(value),
114-
None => AnyValue::Null,
115-
})) as Box<dyn ExactSizeIterator<Item = AnyValue<'_>> + '_>
111+
if arr.null_count() == 0 {
112+
Box::new(arr.values_iter().map(AnyValue::Utf8))
113+
as Box<dyn ExactSizeIterator<Item = AnyValue<'_>> + '_>
114+
} else {
115+
let zipvalid = arr.iter();
116+
Box::new(zipvalid.unwrap_optional().map(|v| match v {
117+
Some(value) => AnyValue::Utf8(value),
118+
None => AnyValue::Null,
119+
}))
120+
as Box<dyn ExactSizeIterator<Item = AnyValue<'_>> + '_>
121+
}
116122
}
117123
DataType::Boolean => {
118124
let arr = arr.as_any().downcast_ref::<BooleanArray>().unwrap();
119-
Box::new(arr.iter().map(|value| match value {
120-
Some(value) => AnyValue::Boolean(value),
121-
None => AnyValue::Null,
122-
})) as Box<dyn ExactSizeIterator<Item = AnyValue<'_>> + '_>
125+
if arr.null_count() == 0 {
126+
Box::new(arr.values_iter().map(AnyValue::Boolean))
127+
as Box<dyn ExactSizeIterator<Item = AnyValue<'_>> + '_>
128+
} else {
129+
let zipvalid = arr.iter();
130+
Box::new(zipvalid.unwrap_optional().map(|v| match v {
131+
Some(value) => AnyValue::Boolean(value),
132+
None => AnyValue::Null,
133+
}))
134+
as Box<dyn ExactSizeIterator<Item = AnyValue<'_>> + '_>
135+
}
123136
}
124137
_ => Box::new(self.iter()),
125138
}

polars/polars-ops/src/chunked_array/nan_propagating_aggregate.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ where
4343
arr.values().iter().copied().fold_first_(min_or_max_fn)
4444
} else {
4545
arr.iter()
46+
.unwrap_optional()
4647
.map(|opt| opt.copied())
4748
.fold_first_(|a, b| match (a, b) {
4849
(Some(a), Some(b)) => Some(min_or_max_fn(a, b)),

py-polars/Cargo.lock

Lines changed: 23 additions & 14 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)