Skip to content

Commit 40c7072

Browse files
authored
feat: Implement nested Parquet writing for High-Precision Decimals (#19476)
1 parent 687811d commit 40c7072

File tree

6 files changed

+157
-64
lines changed

6 files changed

+157
-64
lines changed

crates/polars-parquet/src/arrow/write/dictionary.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use polars_error::{polars_bail, PolarsResult};
1313
use super::binary::{
1414
build_statistics as binary_build_statistics, encode_plain as binary_encode_plain,
1515
};
16-
use super::fixed_len_bytes::{
16+
use super::fixed_size_binary::{
1717
build_statistics as fixed_binary_build_statistics, encode_plain as fixed_binary_encode_plain,
1818
};
1919
use super::pages::PrimitiveNested;
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
use arrow::array::{Array, FixedSizeBinaryArray};
2+
use polars_error::PolarsResult;
3+
4+
use super::encode_plain;
5+
use crate::parquet::page::DataPage;
6+
use crate::parquet::schema::types::PrimitiveType;
7+
use crate::parquet::statistics::FixedLenStatistics;
8+
use crate::read::schema::is_nullable;
9+
use crate::write::{utils, EncodeNullability, Encoding, WriteOptions};
10+
11+
pub fn array_to_page(
12+
array: &FixedSizeBinaryArray,
13+
options: WriteOptions,
14+
type_: PrimitiveType,
15+
statistics: Option<FixedLenStatistics>,
16+
) -> PolarsResult<DataPage> {
17+
let is_optional = is_nullable(&type_.field_info);
18+
let encode_options = EncodeNullability::new(is_optional);
19+
20+
let validity = array.validity();
21+
22+
let mut buffer = vec![];
23+
utils::write_def_levels(
24+
&mut buffer,
25+
is_optional,
26+
validity,
27+
array.len(),
28+
options.version,
29+
)?;
30+
31+
let definition_levels_byte_length = buffer.len();
32+
33+
encode_plain(array, encode_options, &mut buffer);
34+
35+
utils::build_plain_page(
36+
buffer,
37+
array.len(),
38+
array.len(),
39+
array.null_count(),
40+
0,
41+
definition_levels_byte_length,
42+
statistics.map(|x| x.serialize()),
43+
type_,
44+
options,
45+
Encoding::Plain,
46+
)
47+
}

crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs renamed to crates/polars-parquet/src/arrow/write/fixed_size_binary/mod.rs

Lines changed: 6 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1+
mod basic;
2+
mod nested;
3+
14
use arrow::array::{Array, FixedSizeBinaryArray, PrimitiveArray};
25
use arrow::types::i256;
3-
use polars_error::PolarsResult;
6+
pub use basic::array_to_page;
7+
pub use nested::array_to_page as nested_array_to_page;
48

59
use super::binary::ord_binary;
6-
use super::{utils, EncodeNullability, StatisticsOptions, WriteOptions};
7-
use crate::arrow::read::schema::is_nullable;
8-
use crate::parquet::encoding::Encoding;
9-
use crate::parquet::page::DataPage;
10+
use super::{EncodeNullability, StatisticsOptions};
1011
use crate::parquet::schema::types::PrimitiveType;
1112
use crate::parquet::statistics::FixedLenStatistics;
1213

@@ -27,44 +28,6 @@ pub(crate) fn encode_plain(
2728
}
2829
}
2930

30-
pub fn array_to_page(
31-
array: &FixedSizeBinaryArray,
32-
options: WriteOptions,
33-
type_: PrimitiveType,
34-
statistics: Option<FixedLenStatistics>,
35-
) -> PolarsResult<DataPage> {
36-
let is_optional = is_nullable(&type_.field_info);
37-
let encode_options = EncodeNullability::new(is_optional);
38-
39-
let validity = array.validity();
40-
41-
let mut buffer = vec![];
42-
utils::write_def_levels(
43-
&mut buffer,
44-
is_optional,
45-
validity,
46-
array.len(),
47-
options.version,
48-
)?;
49-
50-
let definition_levels_byte_length = buffer.len();
51-
52-
encode_plain(array, encode_options, &mut buffer);
53-
54-
utils::build_plain_page(
55-
buffer,
56-
array.len(),
57-
array.len(),
58-
array.null_count(),
59-
0,
60-
definition_levels_byte_length,
61-
statistics.map(|x| x.serialize()),
62-
type_,
63-
options,
64-
Encoding::Plain,
65-
)
66-
}
67-
6831
pub(super) fn build_statistics(
6932
array: &FixedSizeBinaryArray,
7033
primitive_type: PrimitiveType,
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
use arrow::array::{Array, FixedSizeBinaryArray};
2+
use polars_error::PolarsResult;
3+
4+
use super::encode_plain;
5+
use crate::parquet::page::DataPage;
6+
use crate::parquet::schema::types::PrimitiveType;
7+
use crate::parquet::statistics::FixedLenStatistics;
8+
use crate::read::schema::is_nullable;
9+
use crate::write::{nested, utils, EncodeNullability, Encoding, Nested, WriteOptions};
10+
11+
pub fn array_to_page(
12+
array: &FixedSizeBinaryArray,
13+
options: WriteOptions,
14+
type_: PrimitiveType,
15+
nested: &[Nested],
16+
statistics: Option<FixedLenStatistics>,
17+
) -> PolarsResult<DataPage> {
18+
let is_optional = is_nullable(&type_.field_info);
19+
let encode_options = EncodeNullability::new(is_optional);
20+
21+
let mut buffer = vec![];
22+
let (repetition_levels_byte_length, definition_levels_byte_length) =
23+
nested::write_rep_and_def(options.version, nested, &mut buffer)?;
24+
25+
encode_plain(array, encode_options, &mut buffer);
26+
27+
utils::build_plain_page(
28+
buffer,
29+
nested::num_values(nested),
30+
nested[0].len(),
31+
array.null_count(),
32+
repetition_levels_byte_length,
33+
definition_levels_byte_length,
34+
statistics.map(|x| x.serialize()),
35+
type_,
36+
options,
37+
Encoding::Plain,
38+
)
39+
}

crates/polars-parquet/src/arrow/write/mod.rs

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ mod binview;
1717
mod boolean;
1818
mod dictionary;
1919
mod file;
20-
mod fixed_len_bytes;
20+
mod fixed_size_binary;
2121
mod nested;
2222
mod pages;
2323
mod primitive;
@@ -528,15 +528,15 @@ pub fn array_to_page_simple(
528528
array.validity().cloned(),
529529
);
530530
let statistics = if options.has_statistics() {
531-
Some(fixed_len_bytes::build_statistics(
531+
Some(fixed_size_binary::build_statistics(
532532
&array,
533533
type_.clone(),
534534
&options.statistics,
535535
))
536536
} else {
537537
None
538538
};
539-
fixed_len_bytes::array_to_page(&array, options, type_, statistics)
539+
fixed_size_binary::array_to_page(&array, options, type_, statistics)
540540
},
541541
ArrowDataType::Interval(IntervalUnit::DayTime) => {
542542
let array = array
@@ -555,20 +555,20 @@ pub fn array_to_page_simple(
555555
array.validity().cloned(),
556556
);
557557
let statistics = if options.has_statistics() {
558-
Some(fixed_len_bytes::build_statistics(
558+
Some(fixed_size_binary::build_statistics(
559559
&array,
560560
type_.clone(),
561561
&options.statistics,
562562
))
563563
} else {
564564
None
565565
};
566-
fixed_len_bytes::array_to_page(&array, options, type_, statistics)
566+
fixed_size_binary::array_to_page(&array, options, type_, statistics)
567567
},
568568
ArrowDataType::FixedSizeBinary(_) => {
569569
let array = array.as_any().downcast_ref().unwrap();
570570
let statistics = if options.has_statistics() {
571-
Some(fixed_len_bytes::build_statistics(
571+
Some(fixed_size_binary::build_statistics(
572572
array,
573573
type_.clone(),
574574
&options.statistics,
@@ -577,7 +577,7 @@ pub fn array_to_page_simple(
577577
None
578578
};
579579

580-
fixed_len_bytes::array_to_page(array, options, type_, statistics)
580+
fixed_size_binary::array_to_page(array, options, type_, statistics)
581581
},
582582
ArrowDataType::Decimal256(precision, _) => {
583583
let precision = *precision;
@@ -620,7 +620,7 @@ pub fn array_to_page_simple(
620620
} else if precision <= 38 {
621621
let size = decimal_length_from_precision(precision);
622622
let statistics = if options.has_statistics() {
623-
let stats = fixed_len_bytes::build_statistics_decimal256_with_i128(
623+
let stats = fixed_size_binary::build_statistics_decimal256_with_i128(
624624
array,
625625
type_.clone(),
626626
size,
@@ -641,15 +641,15 @@ pub fn array_to_page_simple(
641641
values.into(),
642642
array.validity().cloned(),
643643
);
644-
fixed_len_bytes::array_to_page(&array, options, type_, statistics)
644+
fixed_size_binary::array_to_page(&array, options, type_, statistics)
645645
} else {
646646
let size = 32;
647647
let array = array
648648
.as_any()
649649
.downcast_ref::<PrimitiveArray<i256>>()
650650
.unwrap();
651651
let statistics = if options.has_statistics() {
652-
let stats = fixed_len_bytes::build_statistics_decimal256(
652+
let stats = fixed_size_binary::build_statistics_decimal256(
653653
array,
654654
type_.clone(),
655655
size,
@@ -670,7 +670,7 @@ pub fn array_to_page_simple(
670670
array.validity().cloned(),
671671
);
672672

673-
fixed_len_bytes::array_to_page(&array, options, type_, statistics)
673+
fixed_size_binary::array_to_page(&array, options, type_, statistics)
674674
}
675675
},
676676
ArrowDataType::Decimal(precision, _) => {
@@ -715,7 +715,7 @@ pub fn array_to_page_simple(
715715
let size = decimal_length_from_precision(precision);
716716

717717
let statistics = if options.has_statistics() {
718-
let stats = fixed_len_bytes::build_statistics_decimal(
718+
let stats = fixed_size_binary::build_statistics_decimal(
719719
array,
720720
type_.clone(),
721721
size,
@@ -736,7 +736,7 @@ pub fn array_to_page_simple(
736736
values.into(),
737737
array.validity().cloned(),
738738
);
739-
fixed_len_bytes::array_to_page(&array, options, type_, statistics)
739+
fixed_size_binary::array_to_page(&array, options, type_, statistics)
740740
}
741741
},
742742
other => polars_bail!(nyi = "Writing parquet pages for data type {other:?}"),
@@ -858,7 +858,7 @@ fn array_to_page_nested(
858858
let size = decimal_length_from_precision(precision);
859859

860860
let statistics = if options.has_statistics() {
861-
let stats = fixed_len_bytes::build_statistics_decimal(
861+
let stats = fixed_size_binary::build_statistics_decimal(
862862
array,
863863
type_.clone(),
864864
size,
@@ -879,7 +879,7 @@ fn array_to_page_nested(
879879
values.into(),
880880
array.validity().cloned(),
881881
);
882-
fixed_len_bytes::array_to_page(&array, options, type_, statistics)
882+
fixed_size_binary::nested_array_to_page(&array, options, type_, nested, statistics)
883883
}
884884
},
885885
Decimal256(precision, _) => {
@@ -919,7 +919,7 @@ fn array_to_page_nested(
919919
} else if precision <= 38 {
920920
let size = decimal_length_from_precision(precision);
921921
let statistics = if options.has_statistics() {
922-
let stats = fixed_len_bytes::build_statistics_decimal256_with_i128(
922+
let stats = fixed_size_binary::build_statistics_decimal256_with_i128(
923923
array,
924924
type_.clone(),
925925
size,
@@ -940,15 +940,15 @@ fn array_to_page_nested(
940940
values.into(),
941941
array.validity().cloned(),
942942
);
943-
fixed_len_bytes::array_to_page(&array, options, type_, statistics)
943+
fixed_size_binary::nested_array_to_page(&array, options, type_, nested, statistics)
944944
} else {
945945
let size = 32;
946946
let array = array
947947
.as_any()
948948
.downcast_ref::<PrimitiveArray<i256>>()
949949
.unwrap();
950950
let statistics = if options.has_statistics() {
951-
let stats = fixed_len_bytes::build_statistics_decimal256(
951+
let stats = fixed_size_binary::build_statistics_decimal256(
952952
array,
953953
type_.clone(),
954954
size,
@@ -969,7 +969,7 @@ fn array_to_page_nested(
969969
array.validity().cloned(),
970970
);
971971

972-
fixed_len_bytes::array_to_page(&array, options, type_, statistics)
972+
fixed_size_binary::nested_array_to_page(&array, options, type_, nested, statistics)
973973
}
974974
},
975975
other => polars_bail!(nyi = "Writing nested parquet pages for data type {other:?}"),

py-polars/tests/unit/io/test_parquet.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from __future__ import annotations
22

3+
import decimal
34
import io
45
from datetime import datetime, time, timezone
56
from decimal import Decimal
6-
from typing import IO, TYPE_CHECKING, Any, Literal, cast
7+
from typing import IO, TYPE_CHECKING, Any, Callable, Literal, cast
78

89
import fsspec
910
import numpy as np
@@ -1995,6 +1996,49 @@ def test_nested_nonnullable_19158() -> None:
19951996
assert_frame_equal(pl.read_parquet(f), pl.DataFrame(tbl))
19961997

19971998

1999+
D = Decimal
2000+
2001+
2002+
@pytest.mark.parametrize("precision", range(1, 37, 2))
2003+
@pytest.mark.parametrize(
2004+
"nesting",
2005+
[
2006+
# Struct
2007+
lambda t: ([{"x": None}, None], pl.Struct({"x": t})),
2008+
lambda t: ([None, {"x": None}], pl.Struct({"x": t})),
2009+
lambda t: ([{"x": D("1.5")}, None], pl.Struct({"x": t})),
2010+
lambda t: ([{"x": D("1.5")}, {"x": D("4.8")}], pl.Struct({"x": t})),
2011+
# Array
2012+
lambda t: ([[None, None, D("8.2")], None], pl.Array(t, 3)),
2013+
lambda t: ([None, [None, D("8.9"), None]], pl.Array(t, 3)),
2014+
lambda t: ([[D("1.5"), D("3.7"), D("4.1")], None], pl.Array(t, 3)),
2015+
lambda t: (
2016+
[[D("1.5"), D("3.7"), D("4.1")], [D("2.8"), D("5.2"), D("8.9")]],
2017+
pl.Array(t, 3),
2018+
),
2019+
# List
2020+
lambda t: ([[None, D("8.2")], None], pl.List(t)),
2021+
lambda t: ([None, [D("8.9"), None]], pl.List(t)),
2022+
lambda t: ([[D("1.5"), D("4.1")], None], pl.List(t)),
2023+
lambda t: ([[D("1.5"), D("3.7"), D("4.1")], [D("2.8"), D("8.9")]], pl.List(t)),
2024+
],
2025+
)
2026+
def test_decimal_precision_nested_roundtrip(
2027+
nesting: Callable[[pl.DataType], tuple[list[Any], pl.DataType]],
2028+
precision: int,
2029+
) -> None:
2030+
# Limit the context as to not disturb any other tests
2031+
with decimal.localcontext() as ctx:
2032+
ctx.prec = precision
2033+
2034+
decimal_dtype = pl.Decimal(precision=precision)
2035+
values, dtype = nesting(decimal_dtype)
2036+
2037+
df = pl.Series("a", values, dtype).to_frame()
2038+
2039+
test_round_trip(df)
2040+
2041+
19982042
@pytest.mark.parametrize("parallel", ["prefiltered", "columns", "row_groups", "auto"])
19992043
def test_conserve_sortedness(
20002044
monkeypatch: Any, capfd: Any, parallel: pl.ParallelStrategy

0 commit comments

Comments
 (0)