19
19
20
20
import org .apache .seatunnel .api .common .SeaTunnelAPIErrorCode ;
21
21
import org .apache .seatunnel .api .configuration .ReadonlyConfig ;
22
- import org .apache .seatunnel .api .serialization .DeserializationSchema ;
23
22
import org .apache .seatunnel .api .source .Collector ;
24
23
import org .apache .seatunnel .api .table .catalog .CatalogTable ;
25
24
import org .apache .seatunnel .api .table .catalog .CatalogTableUtil ;
39
38
import org .apache .seatunnel .format .csv .processor .CsvLineProcessor ;
40
39
import org .apache .seatunnel .format .csv .processor .DefaultCsvLineProcessor ;
41
40
41
+ import org .apache .commons .csv .CSVFormat ;
42
+ import org .apache .commons .csv .CSVParser ;
43
+ import org .apache .commons .csv .CSVRecord ;
44
+
42
45
import io .airlift .compress .lzo .LzopCodec ;
43
46
import lombok .extern .slf4j .Slf4j ;
44
47
47
50
import java .io .InputStream ;
48
51
import java .io .InputStreamReader ;
49
52
import java .nio .charset .StandardCharsets ;
53
+ import java .util .HashMap ;
50
54
import java .util .Map ;
51
55
import java .util .Optional ;
52
56
53
57
@ Slf4j
54
58
public class CsvReadStrategy extends AbstractReadStrategy {
55
- private DeserializationSchema < SeaTunnelRow > deserializationSchema ;
59
+ private CsvDeserializationSchema deserializationSchema ;
56
60
private String fieldDelimiter = BaseSourceConfigOptions .FIELD_DELIMITER .defaultValue ();
57
61
private DateUtils .Formatter dateFormat = BaseSourceConfigOptions .DATE_FORMAT .defaultValue ();
58
62
private DateTimeUtils .Formatter datetimeFormat =
@@ -62,6 +66,7 @@ public class CsvReadStrategy extends AbstractReadStrategy {
62
66
private CsvLineProcessor processor ;
63
67
private int [] indexes ;
64
68
private String encoding = BaseSourceConfigOptions .ENCODING .defaultValue ();
69
+ private CatalogTable inputCatalogTable ;
65
70
66
71
@ Override
67
72
public void read (String path , String tableId , Collector <SeaTunnelRow > output )
@@ -96,51 +101,54 @@ public void readProcess(
96
101
break ;
97
102
}
98
103
104
+ CSVFormat csvFormat = CSVFormat .DEFAULT ;
99
105
try (BufferedReader reader =
100
- new BufferedReader (new InputStreamReader (actualInputStream , encoding ))) {
101
- reader .lines ()
102
- .skip (skipHeaderNumber )
103
- .forEach (
104
- line -> {
105
- try {
106
- SeaTunnelRow seaTunnelRow =
107
- deserializationSchema .deserialize (
108
- line .getBytes (StandardCharsets .UTF_8 ));
109
- if (!readColumns .isEmpty ()) {
110
- // need column projection
111
- Object [] fields ;
112
- if (isMergePartition ) {
113
- fields =
114
- new Object
115
- [readColumns .size ()
116
- + partitionsMap .size ()];
117
- } else {
118
- fields = new Object [readColumns .size ()];
119
- }
120
- for (int i = 0 ; i < indexes .length ; i ++) {
121
- fields [i ] = seaTunnelRow .getField (indexes [i ]);
122
- }
123
- seaTunnelRow = new SeaTunnelRow (fields );
124
- }
125
- if (isMergePartition ) {
126
- int index = seaTunnelRowType .getTotalFields ();
127
- for (String value : partitionsMap .values ()) {
128
- seaTunnelRow .setField (index ++, value );
129
- }
130
- }
131
- seaTunnelRow .setTableId (tableId );
132
- output .collect (seaTunnelRow );
133
- } catch (IOException e ) {
134
- String errorMsg =
135
- String .format (
136
- "Deserialize this data [%s] failed, please check the origin data" ,
137
- line );
138
- throw new FileConnectorException (
139
- FileConnectorErrorCode .DATA_DESERIALIZE_FAILED ,
140
- errorMsg ,
141
- e );
142
- }
143
- });
106
+ new BufferedReader (new InputStreamReader (actualInputStream , encoding ));
107
+ CSVParser csvParser = new CSVParser (reader , csvFormat ); ) {
108
+ for (int i = 0 ; i < skipHeaderNumber ; i ++) {
109
+ if (reader .readLine () == null ) {
110
+ throw new IOException (
111
+ String .format (
112
+ "File [%s] has fewer lines than expected to skip." ,
113
+ currentFileName ));
114
+ }
115
+ }
116
+ // read lines
117
+ for (CSVRecord csvRecord : csvParser ) {
118
+ HashMap <Integer , String > fieldIdValueMap = new HashMap <>();
119
+ for (int i = 0 ; i < inputCatalogTable .getTableSchema ().getColumns ().size (); i ++) {
120
+ fieldIdValueMap .put (i , csvRecord .get (i ));
121
+ }
122
+ SeaTunnelRow seaTunnelRow = deserializationSchema .getSeaTunnelRow (fieldIdValueMap );
123
+ if (!readColumns .isEmpty ()) {
124
+ // need column projection
125
+ Object [] fields ;
126
+ if (isMergePartition ) {
127
+ fields = new Object [readColumns .size () + partitionsMap .size ()];
128
+ } else {
129
+ fields = new Object [readColumns .size ()];
130
+ }
131
+ for (int i = 0 ; i < indexes .length ; i ++) {
132
+ fields [i ] = seaTunnelRow .getField (indexes [i ]);
133
+ }
134
+ seaTunnelRow = new SeaTunnelRow (fields );
135
+ }
136
+ if (isMergePartition ) {
137
+ int index = seaTunnelRowType .getTotalFields ();
138
+ for (String value : partitionsMap .values ()) {
139
+ seaTunnelRow .setField (index ++, value );
140
+ }
141
+ }
142
+ seaTunnelRow .setTableId (tableId );
143
+ output .collect (seaTunnelRow );
144
+ }
145
+ } catch (IOException e ) {
146
+ String errorMsg =
147
+ String .format (
148
+ "Deserialize this file [%s] failed, please check the origin data" ,
149
+ currentFileName );
150
+ throw new FileConnectorException (
151
+ FileConnectorErrorCode .DATA_DESERIALIZE_FAILED , errorMsg , e );
144
152
}
145
153
}
146
154
@@ -177,6 +185,7 @@ public SeaTunnelRowType getSeaTunnelRowTypeInfo(String path) {
177
185
@ Override
178
186
public void setCatalogTable (CatalogTable catalogTable ) {
179
187
SeaTunnelRowType rowType = catalogTable .getSeaTunnelRowType ();
188
+ this .inputCatalogTable = catalogTable ;
180
189
SeaTunnelRowType userDefinedRowTypeWithPartition =
181
190
mergePartitionTypes (fileNames .get (0 ), rowType );
182
191
ReadonlyConfig readonlyConfig = ReadonlyConfig .fromConfig (pluginConfig );
0 commit comments