@@ -35,6 +35,7 @@ mod adapter;
35
35
mod deserialize;
36
36
37
37
pub use adapter:: RowGroupImplBuilder ;
38
+ use databend_common_exception:: Result ;
38
39
pub use deserialize:: column_chunks_to_record_batch;
39
40
40
41
use crate :: io:: read:: block:: block_reader_merge_io:: DataItem ;
@@ -48,17 +49,41 @@ impl BlockReader {
48
49
column_chunks : HashMap < ColumnId , DataItem > ,
49
50
compression : & Compression ,
50
51
block_path : & str ,
51
- ) -> databend_common_exception:: Result < DataBlock > {
52
+ ) -> Result < DataBlock > {
53
+ let mut blocks = self . deserialize_parquet_to_blocks (
54
+ num_rows,
55
+ column_metas,
56
+ column_chunks,
57
+ compression,
58
+ block_path,
59
+ num_rows,
60
+ ) ?;
61
+ // Defensive check: using `num_rows` as batch_size, expects only one block
62
+ assert_eq ! ( blocks. len( ) , 1 ) ;
63
+ Ok ( blocks. pop ( ) . unwrap ( ) )
64
+ }
65
+
66
+ pub ( crate ) fn deserialize_parquet_to_blocks (
67
+ & self ,
68
+ num_rows : usize ,
69
+ column_metas : & HashMap < ColumnId , ColumnMeta > ,
70
+ column_chunks : HashMap < ColumnId , DataItem > ,
71
+ compression : & Compression ,
72
+ block_path : & str ,
73
+ batch_size : usize ,
74
+ ) -> Result < Vec < DataBlock > > {
52
75
if column_chunks. is_empty ( ) {
53
- return self . build_default_values_block ( num_rows) ;
76
+ return Ok ( vec ! [ self . build_default_values_block( num_rows) ? ] ) ;
54
77
}
55
- let record_batch = column_chunks_to_record_batch (
78
+
79
+ let record_batches = column_chunks_to_record_batch (
56
80
& self . original_schema ,
57
81
num_rows,
58
82
& column_chunks,
59
83
compression,
84
+ batch_size,
60
85
) ?;
61
- let mut columns = Vec :: with_capacity ( self . projected_schema . fields . len ( ) ) ;
86
+
62
87
let name_paths = column_name_paths ( & self . projection , & self . original_schema ) ;
63
88
64
89
let array_cache = if self . put_cache {
@@ -67,58 +92,71 @@ impl BlockReader {
67
92
None
68
93
} ;
69
94
70
- for ( ( i, field) , column_node) in self
71
- . projected_schema
72
- . fields
73
- . iter ( )
74
- . enumerate ( )
75
- . zip ( self . project_column_nodes . iter ( ) )
76
- {
77
- let data_type = field. data_type ( ) . into ( ) ;
78
-
79
- // NOTE, there is something tricky here:
80
- // - `column_chunks` always contains data of leaf columns
81
- // - here we may processing a nested type field
82
- // - But, even if the field being processed is a field with multiple leaf columns
83
- // `column_chunks.get(&field.column_id)` will still return Some(DataItem::_)[^1],
84
- // even if we are getting data from `column_chunks` using a non-leaf
85
- // `column_id` of `projected_schema.fields`
86
- //
87
- // [^1]: Except in the current block, there is no data stored for the
88
- // corresponding field, and a default value has been declared for
89
- // the corresponding field.
90
- //
91
- // Yes, it is too obscure, we need to polish it later.
92
-
93
- let value = match column_chunks. get ( & field. column_id ) {
94
- Some ( DataItem :: RawData ( data) ) => {
95
- // get the deserialized arrow array, which may be a nested array
96
- let arrow_array = column_by_name ( & record_batch, & name_paths[ i] ) ;
97
- if !column_node. is_nested {
98
- if let Some ( cache) = & array_cache {
99
- let meta = column_metas. get ( & field. column_id ) . unwrap ( ) ;
100
- let ( offset, len) = meta. offset_length ( ) ;
101
- let key =
102
- TableDataCacheKey :: new ( block_path, field. column_id , offset, len) ;
103
- cache. insert ( key. into ( ) , ( arrow_array. clone ( ) , data. len ( ) ) ) ;
95
+ let mut blocks = Vec :: with_capacity ( record_batches. len ( ) ) ;
96
+
97
+ for record_batch in record_batches {
98
+ let num_rows_record_batch = record_batch. num_rows ( ) ;
99
+ let mut columns = Vec :: with_capacity ( self . projected_schema . fields . len ( ) ) ;
100
+ for ( ( i, field) , column_node) in self
101
+ . projected_schema
102
+ . fields
103
+ . iter ( )
104
+ . enumerate ( )
105
+ . zip ( self . project_column_nodes . iter ( ) )
106
+ {
107
+ let data_type = field. data_type ( ) . into ( ) ;
108
+
109
+ // NOTE, there is something tricky here:
110
+ // - `column_chunks` always contains data of leaf columns
111
+ // - here we may be processing a nested type field
112
+ // - But, even if the field being processed is a field with multiple leaf columns
113
+ // `column_chunks.get(&field.column_id)` will still return Some(DataItem::_)[^1],
114
+ // even if we are getting data from `column_chunks` using a non-leaf
115
+ // `column_id` of `projected_schema.fields`
116
+ //
117
+ // [^1]: Except in the current block, there is no data stored for the
118
+ // corresponding field, and a default value has been declared for
119
+ // the corresponding field.
120
+ //
121
+ // Yes, it is too obscure, we need to polish it later.
122
+
123
+ let value = match column_chunks. get ( & field. column_id ) {
124
+ Some ( DataItem :: RawData ( data) ) => {
125
+ // get the deserialized arrow array, which may be a nested array
126
+ let arrow_array = column_by_name ( & record_batch, & name_paths[ i] ) ;
127
+ if !column_node. is_nested {
128
+ if let Some ( cache) = & array_cache {
129
+ let meta = column_metas. get ( & field. column_id ) . unwrap ( ) ;
130
+ let ( offset, len) = meta. offset_length ( ) ;
131
+ let key = TableDataCacheKey :: new (
132
+ block_path,
133
+ field. column_id ,
134
+ offset,
135
+ len,
136
+ ) ;
137
+ cache. insert ( key. into ( ) , ( arrow_array. clone ( ) , data. len ( ) ) ) ;
138
+ }
104
139
}
140
+ Value :: from_arrow_rs ( arrow_array, & data_type) ?
105
141
}
106
- Value :: from_arrow_rs ( arrow_array, & data_type) ?
107
- }
108
- Some ( DataItem :: ColumnArray ( cached) ) => {
109
- if column_node. is_nested {
110
- // a defensive check, should never happen
111
- return Err ( ErrorCode :: StorageOther (
112
- "unexpected nested field: nested leaf field hits cached" ,
113
- ) ) ;
142
+ Some ( DataItem :: ColumnArray ( cached) ) => {
143
+ // TODO this is NOT correct: the cached array is used as-is for every record batch, regardless of this batch's row count
144
+ if column_node. is_nested {
145
+ // a defensive check, should never happen
146
+ return Err ( ErrorCode :: StorageOther (
147
+ "unexpected nested field: nested leaf field hits cached" ,
148
+ ) ) ;
149
+ }
150
+ Value :: from_arrow_rs ( cached. 0 . clone ( ) , & data_type) ?
114
151
}
115
- Value :: from_arrow_rs ( cached . 0 . clone ( ) , & data_type ) ?
116
- }
117
- None => Value :: Scalar ( self . default_vals [ i ] . clone ( ) ) ,
118
- } ;
119
- columns . push ( BlockEntry :: new ( data_type , value ) ) ;
152
+ None => Value :: Scalar ( self . default_vals [ i ] . clone ( ) ) ,
153
+ } ;
154
+ columns . push ( BlockEntry :: new ( data_type , value ) ) ;
155
+ }
156
+ blocks . push ( DataBlock :: new ( columns , num_rows_record_batch ) ) ;
120
157
}
121
- Ok ( DataBlock :: new ( columns, num_rows) )
158
+
159
+ Ok ( blocks)
122
160
}
123
161
}
124
162
0 commit comments