25
25
import org .apache .seatunnel .api .table .type .SeaTunnelDataType ;
26
26
import org .apache .seatunnel .api .table .type .SeaTunnelRow ;
27
27
import org .apache .seatunnel .api .table .type .SeaTunnelRowType ;
28
+ import org .apache .seatunnel .common .utils .JsonUtils ;
28
29
import org .apache .seatunnel .connectors .seatunnel .file .config .ArchiveCompressFormat ;
29
30
import org .apache .seatunnel .connectors .seatunnel .file .config .BaseSourceConfigOptions ;
30
31
import org .apache .seatunnel .connectors .seatunnel .file .config .FileFormat ;
32
+ import org .apache .seatunnel .connectors .seatunnel .file .config .FilePathRule ;
31
33
import org .apache .seatunnel .connectors .seatunnel .file .config .HadoopConf ;
32
34
import org .apache .seatunnel .connectors .seatunnel .file .hadoop .HadoopFileSystemProxy ;
33
35
37
39
import org .apache .commons .compress .compressors .gzip .GzipParameters ;
38
40
import org .apache .hadoop .fs .FileStatus ;
39
41
42
+ import io .krakens .grok .api .Grok ;
43
+ import io .krakens .grok .api .GrokCompiler ;
44
+ import io .krakens .grok .api .Match ;
40
45
import lombok .extern .slf4j .Slf4j ;
41
46
42
47
import java .io .ByteArrayInputStream ;
@@ -70,6 +75,7 @@ public abstract class AbstractReadStrategy implements ReadStrategy {
70
75
protected static final BigDecimal [] TYPE_ARRAY_BIG_DECIMAL = new BigDecimal [0 ];
71
76
protected static final LocalDate [] TYPE_ARRAY_LOCAL_DATE = new LocalDate [0 ];
72
77
protected static final LocalDateTime [] TYPE_ARRAY_LOCAL_DATETIME = new LocalDateTime [0 ];
78
+ private static final String STATIC_PATH_PATTERN = "(/[^%]+)" ;
73
79
74
80
protected HadoopConf hadoopConf ;
75
81
protected SeaTunnelRowType seaTunnelRowType ;
@@ -86,11 +92,37 @@ public abstract class AbstractReadStrategy implements ReadStrategy {
86
92
BaseSourceConfigOptions .ARCHIVE_COMPRESS_CODEC .defaultValue ();
87
93
88
94
protected Pattern pattern ;
95
+ protected final GrokCompiler grokCompiler = GrokCompiler .newInstance ();
96
+ protected Grok grok ;
89
97
90
98
@ Override
91
99
public void init (HadoopConf conf ) {
92
100
this .hadoopConf = conf ;
93
101
this .hadoopFileSystemProxy = new HadoopFileSystemProxy (hadoopConf );
102
+ if (pluginConfig .hasPath (BaseSourceConfigOptions .FILE_PATH_RULE .key ())) {
103
+ FilePathRule filePathRule =
104
+ pluginConfig .getEnum (
105
+ FilePathRule .class , BaseSourceConfigOptions .FILE_PATH_RULE .key ());
106
+ switch (filePathRule ) {
107
+ case GROK :
108
+ Map <String , Object > unwrapped =
109
+ pluginConfig
110
+ .getObject (BaseSourceConfigOptions .GROK_PATTERN .key ())
111
+ .unwrapped ();
112
+ Map <String , String > grokPatternMap = new LinkedHashMap <>();
113
+ for (Map .Entry <String , Object > entry : unwrapped .entrySet ()) {
114
+ grokPatternMap .put (entry .getKey (), entry .getValue ().toString ());
115
+ }
116
+ this .grokCompiler .register (grokPatternMap );
117
+ grok =
118
+ grokCompiler .compile (
119
+ pluginConfig .getString (
120
+ BaseSourceConfigOptions .FILE_PATH .key ()));
121
+ break ;
122
+ default :
123
+ break ;
124
+ }
125
+ }
94
126
}
95
127
96
128
@ Override
@@ -106,7 +138,17 @@ boolean checkFileType(String path) {
106
138
107
139
@ Override
108
140
public List <String > getFileNamesByPath (String path ) throws IOException {
141
+ FilePathRule filePathRule = FilePathRule .NONE ;
142
+ if (pluginConfig .hasPath (BaseSourceConfigOptions .FILE_PATH_RULE .key ())) {
143
+ filePathRule =
144
+ pluginConfig .getEnum (
145
+ FilePathRule .class , BaseSourceConfigOptions .FILE_PATH_RULE .key ());
146
+ }
109
147
ArrayList <String > fileNames = new ArrayList <>();
148
+ if (filePathRule == FilePathRule .GROK ) {
149
+
150
+ path = extractStaticPath (path );
151
+ }
110
152
FileStatus [] stats = hadoopFileSystemProxy .listStatus (path );
111
153
for (FileStatus fileStatus : stats ) {
112
154
if (fileStatus .isDirectory ()) {
@@ -118,17 +160,27 @@ public List<String> getFileNamesByPath(String path) throws IOException {
118
160
if (!fileStatus .getPath ().getName ().equals ("_SUCCESS" )
119
161
&& !fileStatus .getPath ().getName ().startsWith ("." )) {
120
162
String filePath = fileStatus .getPath ().toString ();
121
- if (!readPartitions .isEmpty ()) {
122
- for (String readPartition : readPartitions ) {
123
- if (filePath .contains (readPartition )) {
124
- fileNames .add (filePath );
125
- this .fileNames .add (filePath );
126
- break ;
163
+
164
+ switch (filePathRule ) {
165
+ case GROK :
166
+ Match match = grok .match (filePath );
167
+ Map <String , Object > captureMap = match .capture ();
168
+ Map <String , Object > grokRuleMap =
169
+ pluginConfig
170
+ .getObject (BaseSourceConfigOptions .GROK_RULE .key ())
171
+ .unwrapped ();
172
+ FilePathRule .GrokRule grokRule =
173
+ JsonUtils .parseObject (
174
+ JsonUtils .toJsonString (grokRuleMap ),
175
+ FilePathRule .GrokRule .class );
176
+ if (isValidCapture (captureMap , grokRule )) {
177
+ addFileNameIfMatches (filePath , fileNames );
127
178
}
128
- }
129
- } else {
130
- fileNames .add (filePath );
131
- this .fileNames .add (filePath );
179
+ break ;
180
+ case NONE :
181
+ default :
182
+ addFileNameIfMatches (filePath , fileNames );
183
+ break ;
132
184
}
133
185
}
134
186
}
@@ -137,6 +189,86 @@ public List<String> getFileNamesByPath(String path) throws IOException {
137
189
return fileNames ;
138
190
}
139
191
192
+ private void addFileNameIfMatches (String filePath , List <String > fileNames ) {
193
+ if (!readPartitions .isEmpty ()) {
194
+ for (String readPartition : readPartitions ) {
195
+ if (filePath .contains (readPartition )) {
196
+ fileNames .add (filePath );
197
+ this .fileNames .add (filePath );
198
+ break ;
199
+ }
200
+ }
201
+ } else {
202
+ fileNames .add (filePath );
203
+ this .fileNames .add (filePath );
204
+ }
205
+ }
206
+
207
+ private boolean isValidCapture (Map <String , Object > captureMap , FilePathRule .GrokRule grokRule ) {
208
+ if (grokRule .getPatterns () != null ) {
209
+ for (Map .Entry <String , String > entry : grokRule .getPatterns ().entrySet ()) {
210
+ String key = entry .getKey ();
211
+ String regex = entry .getValue ();
212
+ String actualValue = (String ) captureMap .get (key );
213
+ if (actualValue == null ) {
214
+ return false ;
215
+ }
216
+ Pattern pattern = Pattern .compile (regex );
217
+ Matcher matcher = pattern .matcher (actualValue );
218
+ if (!matcher .matches ()) {
219
+ return false ;
220
+ }
221
+ }
222
+ }
223
+
224
+ if (grokRule .getTimeScopes () != null ) {
225
+ for (Map .Entry <String , FilePathRule .TimeScope > entry :
226
+ grokRule .getTimeScopes ().entrySet ()) {
227
+ String key = entry .getKey ();
228
+ FilePathRule .TimeScope timeScope = entry .getValue ();
229
+ String actualValue = (String ) captureMap .get (key );
230
+ if (actualValue == null ) {
231
+ return false ;
232
+ }
233
+ try {
234
+ long actualTime = Long .parseLong (actualValue );
235
+ long startTime = Long .parseLong (timeScope .getStart ());
236
+ long endTime = Long .parseLong (timeScope .getEnd ());
237
+ if (actualTime < startTime || actualTime > endTime ) {
238
+ return false ;
239
+ }
240
+ } catch (NumberFormatException e ) {
241
+ return false ;
242
+ }
243
+ }
244
+ }
245
+
246
+ if (grokRule .getEnumRules () != null ) {
247
+ for (Map .Entry <String , List <String >> entry : grokRule .getEnumRules ().entrySet ()) {
248
+ String key = entry .getKey ();
249
+ List <String > expectedValues = entry .getValue ();
250
+ String actualValue = (String ) captureMap .get (key );
251
+ if (actualValue == null || !expectedValues .contains (actualValue )) {
252
+ return false ;
253
+ }
254
+ }
255
+ }
256
+
257
+ return true ;
258
+ }
259
+
260
+ private static String extractStaticPath (String inputPath ) {
261
+ Pattern pattern = Pattern .compile (STATIC_PATH_PATTERN );
262
+ Matcher matcher = pattern .matcher (inputPath );
263
+
264
+ StringBuilder staticPath = new StringBuilder ();
265
+ while (matcher .find ()) {
266
+ staticPath .append (matcher .group (1 ));
267
+ }
268
+
269
+ return staticPath .toString ();
270
+ }
271
+
140
272
@ Override
141
273
public void setPluginConfig (Config pluginConfig ) {
142
274
this .pluginConfig = pluginConfig ;
0 commit comments