@@ -19,25 +19,28 @@
package org.apache.iceberg.spark.source;

import java.util.Map;
-import java.util.Set;
+import org.apache.iceberg.ContentScanTask;
import org.apache.iceberg.FileFormat;
-import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.ScanTask;
import org.apache.iceberg.ScanTaskGroup;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
-import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.io.datafile.DataFileServiceRegistry;
+import org.apache.iceberg.io.datafile.DeleteFilter;
+import org.apache.iceberg.io.datafile.ReaderBuilder;
+import org.apache.iceberg.io.datafile.ReaderService;
+import org.apache.iceberg.io.datafile.ServiceBase;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.parquet.Parquet;
-import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.spark.OrcBatchReadConf;
import org.apache.iceberg.spark.ParquetBatchReadConf;
import org.apache.iceberg.spark.ParquetReaderType;
import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders;
import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
-import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.vectorized.ColumnarBatch;

abstract class BaseBatchReader<T extends ScanTask> extends BaseReader<ColumnarBatch, T> {
@@ -59,82 +62,111 @@ abstract class BaseBatchReader<T extends ScanTask> extends BaseReader<ColumnarBa

  protected CloseableIterable<ColumnarBatch> newBatchIterable(
      InputFile inputFile,
-      FileFormat format,
-      long start,
-      long length,
-      Expression residual,
-      Map<Integer, ?> idToConstant,
+      ContentScanTask<?> task,
+      Types.StructType unifiedPartitionType,
      SparkDeleteFilter deleteFilter) {
-    switch (format) {
-      case PARQUET:
-        return newParquetIterable(inputFile, start, length, residual, idToConstant, deleteFilter);
+    ReaderBuilder<?> readerBuilder =
+        DataFileServiceRegistry.read(
+                task.file().format(),
+                InternalRow.class.getName(),
+                parquetConf != null ? parquetConf.readerType().name() : null,
+                inputFile,
+                task,
+                expectedSchema(),
+                unifiedPartitionType,
+                deleteFilter)
+            .split(task.start(), task.length())
+            .filter(task.residual())
+            .caseSensitive(caseSensitive())
+            // Spark eagerly consumes the batches. So the underlying memory allocated could be
+            // reused
+            // without worrying about subsequent reads clobbering over each other. This improves
+            // read performance as every batch read doesn't have to pay the cost of allocating
+            // memory.
+            .reuseContainers()
+            .withNameMapping(nameMapping());
+    if (parquetConf != null) {
+      readerBuilder = readerBuilder.recordsPerBatch(parquetConf.batchSize());
+    } else if (orcConf != null) {
+      readerBuilder = readerBuilder.recordsPerBatch(orcConf.batchSize());
+    }
+
+    return readerBuilder.build();
+  }

-      case ORC:
-        return newOrcIterable(inputFile, start, length, residual, idToConstant);
+  public static class IcebergParquetReaderService extends ServiceBase implements ReaderService {
+    @SuppressWarnings("checkstyle:RedundantModifier")
+    public IcebergParquetReaderService() {
+      super(FileFormat.PARQUET, InternalRow.class.getName(), ParquetReaderType.ICEBERG.name());
+    }

-      default:
-        throw new UnsupportedOperationException(
-            "Format: " + format + " not supported for batched reads");
+    @Override
+    public ReaderBuilder<?> builder(
+        InputFile inputFile,
+        ContentScanTask<?> task,
+        Schema readSchema,
+        Types.StructType unifiedPartitionType,
+        DeleteFilter<?> deleteFilter) {
+      // get required schema if there are deletes
+      Schema requiredSchema = deleteFilter != null ? deleteFilter.requiredSchema() : readSchema;
+      return Parquet.read(inputFile)
+          .project(requiredSchema)
+          .createBatchedReaderFunc(
+              fileSchema ->
+                  VectorizedSparkParquetReaders.buildReader(
+                      requiredSchema,
+                      fileSchema,
+                      constantsMap(task, readSchema, unifiedPartitionType),
+                      (DeleteFilter<InternalRow>) deleteFilter));
    }
  }

-  private CloseableIterable<ColumnarBatch> newParquetIterable(
-      InputFile inputFile,
-      long start,
-      long length,
-      Expression residual,
-      Map<Integer, ?> idToConstant,
-      SparkDeleteFilter deleteFilter) {
-    // get required schema if there are deletes
-    Schema requiredSchema = deleteFilter != null ? deleteFilter.requiredSchema() : expectedSchema();
+  public static class CometParquetReaderService extends ServiceBase implements ReaderService {
+    @SuppressWarnings("checkstyle:RedundantModifier")
+    public CometParquetReaderService() {
+      super(FileFormat.PARQUET, InternalRow.class.getName(), ParquetReaderType.COMET.name());
+    }

-    return Parquet.read(inputFile)
-        .project(requiredSchema)
-        .split(start, length)
-        .createBatchedReaderFunc(
-            fileSchema -> {
-              if (parquetConf.readerType() == ParquetReaderType.COMET) {
-                return VectorizedSparkParquetReaders.buildCometReader(
-                    requiredSchema, fileSchema, idToConstant, deleteFilter);
-              } else {
-                return VectorizedSparkParquetReaders.buildReader(
-                    requiredSchema, fileSchema, idToConstant, deleteFilter);
-              }
-            })
-        .recordsPerBatch(parquetConf.batchSize())
-        .filter(residual)
-        .caseSensitive(caseSensitive())
-        // Spark eagerly consumes the batches. So the underlying memory allocated could be reused
-        // without worrying about subsequent reads clobbering over each other. This improves
-        // read performance as every batch read doesn't have to pay the cost of allocating memory.
-        .reuseContainers()
-        .withNameMapping(nameMapping())
-        .build();
+    @Override
+    public ReaderBuilder<?> builder(
+        InputFile inputFile,
+        ContentScanTask<?> task,
+        Schema readSchema,
+        Types.StructType unifiedPartitionType,
+        DeleteFilter<?> deleteFilter) {
+      // get required schema if there are deletes
+      Schema requiredSchema = deleteFilter != null ? deleteFilter.requiredSchema() : readSchema;
+      return Parquet.read(inputFile)
+          .project(requiredSchema)
+          .createBatchedReaderFunc(
+              fileSchema ->
+                  VectorizedSparkParquetReaders.buildCometReader(
+                      requiredSchema,
+                      fileSchema,
+                      constantsMap(task, readSchema, unifiedPartitionType),
+                      (DeleteFilter<InternalRow>) deleteFilter));
+    }
  }

-  private CloseableIterable<ColumnarBatch> newOrcIterable(
-      InputFile inputFile,
-      long start,
-      long length,
-      Expression residual,
-      Map<Integer, ?> idToConstant) {
-    Set<Integer> constantFieldIds = idToConstant.keySet();
-    Set<Integer> metadataFieldIds = MetadataColumns.metadataFieldIds();
-    Sets.SetView<Integer> constantAndMetadataFieldIds =
-        Sets.union(constantFieldIds, metadataFieldIds);
-    Schema schemaWithoutConstantAndMetadataFields =
-        TypeUtil.selectNot(expectedSchema(), constantAndMetadataFieldIds);
+  public static class ORCReaderService extends ServiceBase implements ReaderService {
+    @SuppressWarnings("checkstyle:RedundantModifier")
+    public ORCReaderService() {
+      super(FileFormat.ORC, InternalRow.class.getName());
+    }

-    return ORC.read(inputFile)
-        .project(schemaWithoutConstantAndMetadataFields)
-        .split(start, length)
-        .createBatchedReaderFunc(
-            fileSchema ->
-                VectorizedSparkOrcReaders.buildReader(expectedSchema(), fileSchema, idToConstant))
-        .recordsPerBatch(orcConf.batchSize())
-        .filter(residual)
-        .caseSensitive(caseSensitive())
-        .withNameMapping(nameMapping())
-        .build();
+    @Override
+    public ReaderBuilder<?> builder(
+        InputFile inputFile,
+        ContentScanTask<?> task,
+        Schema readSchema,
+        Types.StructType unifiedPartitionType,
+        DeleteFilter<?> deleteFilter) {
+      Map<Integer, ?> idToConstant = constantsMap(task, readSchema, unifiedPartitionType);
+      return ORC.read(inputFile)
+          .project(ORC.schemaWithoutConstantAndMetadataFields(readSchema, idToConstant))
+          .createBatchedReaderFunc(
+              fileSchema ->
+                  VectorizedSparkOrcReaders.buildReader(readSchema, fileSchema, idToConstant));
+    }
  }
}
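A note on how the pieces above connect, since the diff only shows the registration side: each nested service registers itself under a key of file format, returned row class, and (for Parquet) reader type via the ServiceBase constructor, and newBatchIterable resolves the matching ReaderBuilder through DataFileServiceRegistry.read with the same keys. The sketch below is illustrative only and not part of this change; it reuses the classes imported above, the helper name is hypothetical, and it shows how a caller could request the Comet-backed Parquet batch reader explicitly instead of deriving the reader type from parquetConf.

// Illustrative sketch, not part of this diff. The key triple mirrors the ServiceBase
// constructors above: FileFormat.PARQUET + InternalRow.class.getName() +
// ParquetReaderType.COMET.name() resolves to CometParquetReaderService, while
// ICEBERG.name() would resolve to IcebergParquetReaderService (the ORC path passes
// null as the reader type). The helper name cometBatchIterable is hypothetical.
static CloseableIterable<ColumnarBatch> cometBatchIterable(
    InputFile inputFile,
    ContentScanTask<?> task,
    Schema expectedSchema,
    Types.StructType unifiedPartitionType,
    DeleteFilter<InternalRow> deleteFilter) {
  return DataFileServiceRegistry.read(
          FileFormat.PARQUET,              // matches task.file().format() for Parquet data files
          InternalRow.class.getName(),     // batch readers are keyed on Spark's InternalRow
          ParquetReaderType.COMET.name(),  // selects the Comet service over the Iceberg one
          inputFile,
          task,
          expectedSchema,
          unifiedPartitionType,
          deleteFilter)
      .split(task.start(), task.length()) // read only this task's split of the file
      .filter(task.residual())            // push the task's residual filter into the reader
      .reuseContainers()
      .build();
}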