19
19
package org .apache .iceberg .spark .source ;
20
20
21
21
import java .util .Map ;
22
- import java . util . Set ;
22
+ import org . apache . iceberg . ContentScanTask ;
23
23
import org .apache .iceberg .FileFormat ;
24
- import org .apache .iceberg .MetadataColumns ;
25
24
import org .apache .iceberg .ScanTask ;
26
25
import org .apache .iceberg .ScanTaskGroup ;
27
26
import org .apache .iceberg .Schema ;
28
27
import org .apache .iceberg .Table ;
29
- import org .apache .iceberg .expressions .Expression ;
30
28
import org .apache .iceberg .io .CloseableIterable ;
31
29
import org .apache .iceberg .io .InputFile ;
30
+ import org .apache .iceberg .io .datafile .DataFileServiceRegistry ;
31
+ import org .apache .iceberg .io .datafile .DeleteFilter ;
32
+ import org .apache .iceberg .io .datafile .ReaderBuilder ;
33
+ import org .apache .iceberg .io .datafile .ReaderService ;
34
+ import org .apache .iceberg .io .datafile .ServiceBase ;
32
35
import org .apache .iceberg .orc .ORC ;
33
36
import org .apache .iceberg .parquet .Parquet ;
34
- import org .apache .iceberg .relocated .com .google .common .collect .Sets ;
35
37
import org .apache .iceberg .spark .OrcBatchReadConf ;
36
38
import org .apache .iceberg .spark .ParquetBatchReadConf ;
37
39
import org .apache .iceberg .spark .ParquetReaderType ;
38
40
import org .apache .iceberg .spark .data .vectorized .VectorizedSparkOrcReaders ;
39
41
import org .apache .iceberg .spark .data .vectorized .VectorizedSparkParquetReaders ;
40
- import org .apache .iceberg . types . TypeUtil ;
42
+ import org .apache .spark . sql . catalyst . InternalRow ;
41
43
import org .apache .spark .sql .vectorized .ColumnarBatch ;
42
44
43
45
abstract class BaseBatchReader <T extends ScanTask > extends BaseReader <ColumnarBatch , T > {
@@ -58,83 +60,109 @@ abstract class BaseBatchReader<T extends ScanTask> extends BaseReader<ColumnarBa
58
60
}
59
61
60
62
protected CloseableIterable <ColumnarBatch > newBatchIterable (
61
- InputFile inputFile ,
62
- FileFormat format ,
63
- long start ,
64
- long length ,
65
- Expression residual ,
66
- Map <Integer , ?> idToConstant ,
67
- SparkDeleteFilter deleteFilter ) {
68
- switch (format ) {
69
- case PARQUET :
70
- return newParquetIterable (inputFile , start , length , residual , idToConstant , deleteFilter );
63
+ InputFile inputFile , ContentScanTask <?> task , Table table , SparkDeleteFilter deleteFilter ) {
64
+ ReaderBuilder <?> readerBuilder =
65
+ DataFileServiceRegistry .read (
66
+ task .file ().format (),
67
+ InternalRow .class .getName (),
68
+ parquetConf != null ? parquetConf .readerType ().name () : null ,
69
+ inputFile ,
70
+ task ,
71
+ expectedSchema (),
72
+ table ,
73
+ deleteFilter )
74
+ .split (task .start (), task .length ())
75
+ .filter (task .residual ())
76
+ .caseSensitive (caseSensitive ())
77
+ // Spark eagerly consumes the batches. So the underlying memory allocated could be
78
+ // reused
79
+ // without worrying about subsequent reads clobbering over each other. This improves
80
+ // read performance as every batch read doesn't have to pay the cost of allocating
81
+ // memory.
82
+ .reuseContainers ()
83
+ .withNameMapping (nameMapping ());
84
+ if (parquetConf != null ) {
85
+ readerBuilder = readerBuilder .recordsPerBatch (parquetConf .batchSize ());
86
+ } else if (orcConf != null ) {
87
+ readerBuilder = readerBuilder .recordsPerBatch (orcConf .batchSize ());
88
+ }
89
+
90
+ return readerBuilder .build ();
91
+ }
71
92
72
- case ORC :
73
- return newOrcIterable (inputFile , start , length , residual , idToConstant );
93
+ public static class IcebergParquetReaderService extends ServiceBase implements ReaderService {
94
+ @ SuppressWarnings ("checkstyle:RedundantModifier" )
95
+ public IcebergParquetReaderService () {
96
+ super (FileFormat .PARQUET , InternalRow .class .getName (), ParquetReaderType .ICEBERG .name ());
97
+ }
74
98
75
- default :
76
- throw new UnsupportedOperationException (
77
- "Format: " + format + " not supported for batched reads" );
99
+ @ Override
100
+ public ReaderBuilder <?> builder (
101
+ InputFile inputFile ,
102
+ ContentScanTask <?> task ,
103
+ Schema readSchema ,
104
+ Table table ,
105
+ DeleteFilter <?> deleteFilter ) {
106
+ // get required schema if there are deletes
107
+ Schema requiredSchema = deleteFilter != null ? deleteFilter .requiredSchema () : readSchema ;
108
+ return Parquet .read (inputFile )
109
+ .project (requiredSchema )
110
+ .createBatchedReaderFunc (
111
+ fileSchema ->
112
+ VectorizedSparkParquetReaders .buildReader (
113
+ requiredSchema ,
114
+ fileSchema ,
115
+ constantsMap (task , readSchema , table ),
116
+ (DeleteFilter <InternalRow >) deleteFilter ));
78
117
}
79
118
}
80
119
81
- private CloseableIterable <ColumnarBatch > newParquetIterable (
82
- InputFile inputFile ,
83
- long start ,
84
- long length ,
85
- Expression residual ,
86
- Map <Integer , ?> idToConstant ,
87
- SparkDeleteFilter deleteFilter ) {
88
- // get required schema if there are deletes
89
- Schema requiredSchema = deleteFilter != null ? deleteFilter .requiredSchema () : expectedSchema ();
120
+ public static class CometParquetReaderService extends ServiceBase implements ReaderService {
121
+ @ SuppressWarnings ("checkstyle:RedundantModifier" )
122
+ public CometParquetReaderService () {
123
+ super (FileFormat .PARQUET , InternalRow .class .getName (), ParquetReaderType .COMET .name ());
124
+ }
90
125
91
- return Parquet .read (inputFile )
92
- .project (requiredSchema )
93
- .split (start , length )
94
- .createBatchedReaderFunc (
95
- fileSchema -> {
96
- if (parquetConf .readerType () == ParquetReaderType .COMET ) {
97
- return VectorizedSparkParquetReaders .buildCometReader (
98
- requiredSchema , fileSchema , idToConstant , deleteFilter );
99
- } else {
100
- return VectorizedSparkParquetReaders .buildReader (
101
- requiredSchema , fileSchema , idToConstant , deleteFilter );
102
- }
103
- })
104
- .recordsPerBatch (parquetConf .batchSize ())
105
- .filter (residual )
106
- .caseSensitive (caseSensitive ())
107
- // Spark eagerly consumes the batches. So the underlying memory allocated could be reused
108
- // without worrying about subsequent reads clobbering over each other. This improves
109
- // read performance as every batch read doesn't have to pay the cost of allocating memory.
110
- .reuseContainers ()
111
- .withNameMapping (nameMapping ())
112
- .build ();
126
+ @ Override
127
+ public ReaderBuilder <?> builder (
128
+ InputFile inputFile ,
129
+ ContentScanTask <?> task ,
130
+ Schema readSchema ,
131
+ Table table ,
132
+ DeleteFilter <?> deleteFilter ) {
133
+ // get required schema if there are deletes
134
+ Schema requiredSchema = deleteFilter != null ? deleteFilter .requiredSchema () : readSchema ;
135
+ return Parquet .read (inputFile )
136
+ .project (requiredSchema )
137
+ .createBatchedReaderFunc (
138
+ fileSchema ->
139
+ VectorizedSparkParquetReaders .buildCometReader (
140
+ requiredSchema ,
141
+ fileSchema ,
142
+ constantsMap (task , readSchema , table ),
143
+ (DeleteFilter <InternalRow >) deleteFilter ));
144
+ }
113
145
}
114
146
115
- private CloseableIterable <ColumnarBatch > newOrcIterable (
116
- InputFile inputFile ,
117
- long start ,
118
- long length ,
119
- Expression residual ,
120
- Map <Integer , ?> idToConstant ) {
121
- Set <Integer > constantFieldIds = idToConstant .keySet ();
122
- Set <Integer > metadataFieldIds = MetadataColumns .metadataFieldIds ();
123
- Sets .SetView <Integer > constantAndMetadataFieldIds =
124
- Sets .union (constantFieldIds , metadataFieldIds );
125
- Schema schemaWithoutConstantAndMetadataFields =
126
- TypeUtil .selectNot (expectedSchema (), constantAndMetadataFieldIds );
147
+ public static class ORCReaderService extends ServiceBase implements ReaderService {
148
+ @ SuppressWarnings ("checkstyle:RedundantModifier" )
149
+ public ORCReaderService () {
150
+ super (FileFormat .ORC , InternalRow .class .getName ());
151
+ }
127
152
128
- return ORC .read (inputFile )
129
- .project (schemaWithoutConstantAndMetadataFields )
130
- .split (start , length )
131
- .createBatchedReaderFunc (
132
- fileSchema ->
133
- VectorizedSparkOrcReaders .buildReader (expectedSchema (), fileSchema , idToConstant ))
134
- .recordsPerBatch (orcConf .batchSize ())
135
- .filter (residual )
136
- .caseSensitive (caseSensitive ())
137
- .withNameMapping (nameMapping ())
138
- .build ();
153
+ @ Override
154
+ public ReaderBuilder <?> builder (
155
+ InputFile inputFile ,
156
+ ContentScanTask <?> task ,
157
+ Schema readSchema ,
158
+ Table table ,
159
+ DeleteFilter <?> deleteFilter ) {
160
+ Map <Integer , ?> idToConstant = constantsMap (task , readSchema , table );
161
+ return ORC .read (inputFile )
162
+ .project (ORC .schemaWithoutConstantAndMetadataFields (readSchema , idToConstant ))
163
+ .createBatchedReaderFunc (
164
+ fileSchema ->
165
+ VectorizedSparkOrcReaders .buildReader (readSchema , fileSchema , idToConstant ));
166
+ }
139
167
}
140
168
}
0 commit comments