19
19
package org .apache .iceberg .spark .source ;
20
20
21
21
import java .util .Map ;
22
- import java . util . Set ;
22
+ import org . apache . iceberg . ContentScanTask ;
23
23
import org .apache .iceberg .FileFormat ;
24
- import org .apache .iceberg .MetadataColumns ;
25
24
import org .apache .iceberg .ScanTask ;
26
25
import org .apache .iceberg .ScanTaskGroup ;
27
26
import org .apache .iceberg .Schema ;
28
27
import org .apache .iceberg .Table ;
29
- import org .apache .iceberg .expressions .Expression ;
30
28
import org .apache .iceberg .io .CloseableIterable ;
31
29
import org .apache .iceberg .io .InputFile ;
30
+ import org .apache .iceberg .io .datafile .DataFileServiceRegistry ;
31
+ import org .apache .iceberg .io .datafile .DeleteFilter ;
32
+ import org .apache .iceberg .io .datafile .ReaderBuilder ;
33
+ import org .apache .iceberg .io .datafile .ServiceBase ;
32
34
import org .apache .iceberg .orc .ORC ;
33
35
import org .apache .iceberg .parquet .Parquet ;
34
- import org .apache .iceberg .relocated .com .google .common .collect .Sets ;
35
36
import org .apache .iceberg .spark .OrcBatchReadConf ;
36
37
import org .apache .iceberg .spark .ParquetBatchReadConf ;
37
38
import org .apache .iceberg .spark .ParquetReaderType ;
38
39
import org .apache .iceberg .spark .data .vectorized .VectorizedSparkOrcReaders ;
39
40
import org .apache .iceberg .spark .data .vectorized .VectorizedSparkParquetReaders ;
40
- import org .apache .iceberg .types .TypeUtil ;
41
+ import org .apache .iceberg .types .Types ;
42
+ import org .apache .spark .sql .catalyst .InternalRow ;
41
43
import org .apache .spark .sql .vectorized .ColumnarBatch ;
42
44
43
45
abstract class BaseBatchReader <T extends ScanTask > extends BaseReader <ColumnarBatch , T > {
@@ -59,82 +61,114 @@ abstract class BaseBatchReader<T extends ScanTask> extends BaseReader<ColumnarBa
59
61
60
62
protected CloseableIterable <ColumnarBatch > newBatchIterable (
61
63
InputFile inputFile ,
62
- FileFormat format ,
63
- long start ,
64
- long length ,
65
- Expression residual ,
66
- Map <Integer , ?> idToConstant ,
64
+ ContentScanTask <?> task ,
65
+ Types .StructType unifiedPartitionType ,
67
66
SparkDeleteFilter deleteFilter ) {
68
- switch (format ) {
69
- case PARQUET :
70
- return newParquetIterable (inputFile , start , length , residual , idToConstant , deleteFilter );
67
+ ReaderBuilder <?> readerBuilder =
68
+ DataFileServiceRegistry .read (
69
+ task .file ().format (),
70
+ InternalRow .class .getName (),
71
+ parquetConf != null ? parquetConf .readerType ().name () : null ,
72
+ inputFile ,
73
+ task ,
74
+ expectedSchema (),
75
+ unifiedPartitionType ,
76
+ deleteFilter )
77
+ .split (task .start (), task .length ())
78
+ .filter (task .residual ())
79
+ .caseSensitive (caseSensitive ())
80
+ // Spark eagerly consumes the batches. So the underlying memory allocated could be
81
+ // reused
82
+ // without worrying about subsequent reads clobbering over each other. This improves
83
+ // read performance as every batch read doesn't have to pay the cost of allocating
84
+ // memory.
85
+ .reuseContainers ()
86
+ .withNameMapping (nameMapping ());
87
+ if (parquetConf != null ) {
88
+ readerBuilder = readerBuilder .recordsPerBatch (parquetConf .batchSize ());
89
+ } else if (orcConf != null ) {
90
+ readerBuilder = readerBuilder .recordsPerBatch (orcConf .batchSize ());
91
+ }
92
+
93
+ return readerBuilder .build ();
94
+ }
71
95
72
- case ORC :
73
- return newOrcIterable (inputFile , start , length , residual , idToConstant );
96
+ public static class IcebergParquetReaderService extends ServiceBase
97
+ implements org .apache .iceberg .io .datafile .ReaderService {
98
+ @ SuppressWarnings ("checkstyle:RedundantModifier" )
99
+ public IcebergParquetReaderService () {
100
+ super (FileFormat .PARQUET , InternalRow .class .getName (), ParquetReaderType .ICEBERG .name ());
101
+ }
74
102
75
- default :
76
- throw new UnsupportedOperationException (
77
- "Format: " + format + " not supported for batched reads" );
103
+ @ Override
104
+ public ReaderBuilder <?> builder (
105
+ InputFile inputFile ,
106
+ ContentScanTask <?> task ,
107
+ Schema readSchema ,
108
+ Types .StructType unifiedPartitionType ,
109
+ DeleteFilter <?> deleteFilter ) {
110
+ // get required schema if there are deletes
111
+ Schema requiredSchema = deleteFilter != null ? deleteFilter .requiredSchema () : readSchema ;
112
+ return Parquet .read (inputFile )
113
+ .project (requiredSchema )
114
+ .createBatchedReaderFunc (
115
+ fileSchema ->
116
+ VectorizedSparkParquetReaders .buildReader (
117
+ requiredSchema ,
118
+ fileSchema ,
119
+ constantsMap (task , readSchema , unifiedPartitionType ),
120
+ (DeleteFilter <InternalRow >) deleteFilter ));
78
121
}
79
122
}
80
123
81
- private CloseableIterable <ColumnarBatch > newParquetIterable (
82
- InputFile inputFile ,
83
- long start ,
84
- long length ,
85
- Expression residual ,
86
- Map <Integer , ?> idToConstant ,
87
- SparkDeleteFilter deleteFilter ) {
88
- // get required schema if there are deletes
89
- Schema requiredSchema = deleteFilter != null ? deleteFilter .requiredSchema () : expectedSchema ();
124
+ public static class CometParquetReaderService extends ServiceBase
125
+ implements org .apache .iceberg .io .datafile .ReaderService {
126
+ @ SuppressWarnings ("checkstyle:RedundantModifier" )
127
+ public CometParquetReaderService () {
128
+ super (FileFormat .PARQUET , InternalRow .class .getName (), ParquetReaderType .COMET .name ());
129
+ }
90
130
91
- return Parquet .read (inputFile )
92
- .project (requiredSchema )
93
- .split (start , length )
94
- .createBatchedReaderFunc (
95
- fileSchema -> {
96
- if (parquetConf .readerType () == ParquetReaderType .COMET ) {
97
- return VectorizedSparkParquetReaders .buildCometReader (
98
- requiredSchema , fileSchema , idToConstant , deleteFilter );
99
- } else {
100
- return VectorizedSparkParquetReaders .buildReader (
101
- requiredSchema , fileSchema , idToConstant , deleteFilter );
102
- }
103
- })
104
- .recordsPerBatch (parquetConf .batchSize ())
105
- .filter (residual )
106
- .caseSensitive (caseSensitive ())
107
- // Spark eagerly consumes the batches. So the underlying memory allocated could be reused
108
- // without worrying about subsequent reads clobbering over each other. This improves
109
- // read performance as every batch read doesn't have to pay the cost of allocating memory.
110
- .reuseContainers ()
111
- .withNameMapping (nameMapping ())
112
- .build ();
131
+ @ Override
132
+ public ReaderBuilder <?> builder (
133
+ InputFile inputFile ,
134
+ ContentScanTask <?> task ,
135
+ Schema readSchema ,
136
+ Types .StructType unifiedPartitionType ,
137
+ DeleteFilter <?> deleteFilter ) {
138
+ // get required schema if there are deletes
139
+ Schema requiredSchema = deleteFilter != null ? deleteFilter .requiredSchema () : readSchema ;
140
+ return Parquet .read (inputFile )
141
+ .project (requiredSchema )
142
+ .createBatchedReaderFunc (
143
+ fileSchema ->
144
+ VectorizedSparkParquetReaders .buildCometReader (
145
+ requiredSchema ,
146
+ fileSchema ,
147
+ constantsMap (task , readSchema , unifiedPartitionType ),
148
+ (DeleteFilter <InternalRow >) deleteFilter ));
149
+ }
113
150
}
114
151
115
- private CloseableIterable <ColumnarBatch > newOrcIterable (
116
- InputFile inputFile ,
117
- long start ,
118
- long length ,
119
- Expression residual ,
120
- Map <Integer , ?> idToConstant ) {
121
- Set <Integer > constantFieldIds = idToConstant .keySet ();
122
- Set <Integer > metadataFieldIds = MetadataColumns .metadataFieldIds ();
123
- Sets .SetView <Integer > constantAndMetadataFieldIds =
124
- Sets .union (constantFieldIds , metadataFieldIds );
125
- Schema schemaWithoutConstantAndMetadataFields =
126
- TypeUtil .selectNot (expectedSchema (), constantAndMetadataFieldIds );
152
+ public static class ORCReaderService extends ServiceBase
153
+ implements org .apache .iceberg .io .datafile .ReaderService {
154
+ @ SuppressWarnings ("checkstyle:RedundantModifier" )
155
+ public ORCReaderService () {
156
+ super (FileFormat .ORC , InternalRow .class .getName ());
157
+ }
127
158
128
- return ORC .read (inputFile )
129
- .project (schemaWithoutConstantAndMetadataFields )
130
- .split (start , length )
131
- .createBatchedReaderFunc (
132
- fileSchema ->
133
- VectorizedSparkOrcReaders .buildReader (expectedSchema (), fileSchema , idToConstant ))
134
- .recordsPerBatch (orcConf .batchSize ())
135
- .filter (residual )
136
- .caseSensitive (caseSensitive ())
137
- .withNameMapping (nameMapping ())
138
- .build ();
159
+ @ Override
160
+ public ReaderBuilder <?> builder (
161
+ InputFile inputFile ,
162
+ ContentScanTask <?> task ,
163
+ Schema readSchema ,
164
+ Types .StructType unifiedPartitionType ,
165
+ DeleteFilter <?> deleteFilter ) {
166
+ Map <Integer , ?> idToConstant = constantsMap (task , readSchema , unifiedPartitionType );
167
+ return ORC .read (inputFile )
168
+ .project (ORC .schemaWithoutConstantAndMetadataFields (readSchema , idToConstant ))
169
+ .createBatchedReaderFunc (
170
+ fileSchema ->
171
+ VectorizedSparkOrcReaders .buildReader (readSchema , fileSchema , idToConstant ));
172
+ }
139
173
}
140
174
}
0 commit comments