|
35 | 35 |
|
36 | 36 | import org.apache.flink.api.common.RuntimeExecutionMode; |
37 | 37 | import org.apache.flink.api.common.functions.ReduceFunction; |
38 | | -import org.apache.flink.api.common.typeinfo.TypeHint; |
39 | 38 | import org.apache.flink.api.common.typeinfo.TypeInformation; |
40 | 39 | import org.apache.flink.api.java.tuple.Tuple2; |
41 | | -import org.apache.flink.api.java.tuple.Tuple7; |
42 | 40 | import org.apache.flink.configuration.Configuration; |
43 | 41 | import org.apache.flink.configuration.CoreOptions; |
44 | 42 | import org.apache.flink.configuration.ExecutionOptions; |
|
58 | 56 |
|
59 | 57 | import java.io.IOException; |
60 | 58 | import java.util.ArrayList; |
61 | | -import java.util.Arrays; |
62 | 59 | import java.util.Collections; |
63 | 60 | import java.util.HashMap; |
64 | 61 | import java.util.HashSet; |
@@ -118,8 +115,7 @@ public DataStream<CleanOrphanFilesResult> doOrphanClean(StreamExecutionEnvironme |
118 | 115 | public void processElement( |
119 | 116 | String branch, |
120 | 117 | ProcessFunction<String, Tuple2<Long, Long>>.Context ctx, |
121 | | - Collector<Tuple2<Long, Long>> out) |
122 | | - throws Exception { |
| 118 | + Collector<Tuple2<Long, Long>> out) { |
123 | 119 | AtomicLong deletedFilesCount = new AtomicLong(0); |
124 | 120 | AtomicLong deletedFilesLenInBytes = new AtomicLong(0); |
125 | 121 | cleanBranchSnapshotDir( |
@@ -239,88 +235,17 @@ public void endInput() throws IOException { |
239 | 235 | }); |
240 | 236 |
|
241 | 237 | usedFiles = usedFiles.union(usedManifestFiles); |
242 | | - FileStorePathFactory pathFactory = table.store().pathFactory(); |
243 | | - List<Tuple7<String, String, String, String, String, Integer, String>> tablePaths = |
244 | | - Arrays.asList( |
245 | | - new Tuple7<>( |
246 | | - table.fullName(), |
247 | | - pathFactory.manifestPath().toString(), |
248 | | - pathFactory.indexPath().toString(), |
249 | | - pathFactory.statisticsPath().toString(), |
250 | | - pathFactory.dataFilePath().toString(), |
251 | | - partitionKeysNum, |
252 | | - table.store().options().dataFileExternalPaths())); |
253 | 238 | DataStream<Tuple2<String, Long>> candidates = |
254 | | - env.fromCollection( |
255 | | - tablePaths, |
256 | | - TypeInformation.of( |
257 | | - new TypeHint< |
258 | | - Tuple7< |
259 | | - String, |
260 | | - String, |
261 | | - String, |
262 | | - String, |
263 | | - String, |
264 | | - Integer, |
265 | | - String>>() {})) |
| 239 | + env.fromCollection(Collections.singletonList(1), TypeInformation.of(Integer.class)) |
266 | 240 | .process( |
267 | | - new ProcessFunction< |
268 | | - Tuple7< |
269 | | - String, |
270 | | - String, |
271 | | - String, |
272 | | - String, |
273 | | - String, |
274 | | - Integer, |
275 | | - String>, |
276 | | - Tuple2<String, Long>>() { |
| 241 | + new ProcessFunction<Integer, Tuple2<String, Long>>() { |
277 | 242 | @Override |
278 | 243 | public void processElement( |
279 | | - Tuple7< |
280 | | - String, |
281 | | - String, |
282 | | - String, |
283 | | - String, |
284 | | - String, |
285 | | - Integer, |
286 | | - String> |
287 | | - paths, |
288 | | - ProcessFunction< |
289 | | - Tuple7< |
290 | | - String, |
291 | | - String, |
292 | | - String, |
293 | | - String, |
294 | | - String, |
295 | | - Integer, |
296 | | - String>, |
297 | | - Tuple2<String, Long>> |
298 | | - .Context |
| 244 | + Integer i, |
| 245 | + ProcessFunction<Integer, Tuple2<String, Long>>.Context |
299 | 246 | ctx, |
300 | 247 | Collector<Tuple2<String, Long>> out) { |
301 | | - List<String> dirs = |
302 | | - listPaimonFileDirs( |
303 | | - paths.f0, paths.f1, paths.f2, |
304 | | - paths.f3, paths.f4, paths.f5, |
305 | | - paths.f6) |
306 | | - .stream() |
307 | | - .map(Path::toUri) |
308 | | - .map(Object::toString) |
309 | | - .collect(Collectors.toList()); |
310 | | - for (String dir : dirs) { |
311 | | - for (FileStatus fileStatus : |
312 | | - tryBestListingDirs(new Path(dir))) { |
313 | | - if (oldEnough(fileStatus)) { |
314 | | - out.collect( |
315 | | - new Tuple2( |
316 | | - fileStatus |
317 | | - .getPath() |
318 | | - .toUri() |
319 | | - .toString(), |
320 | | - fileStatus.getLen())); |
321 | | - } |
322 | | - } |
323 | | - } |
| 248 | + listPaimonFilesForTable(out); |
324 | 249 | } |
325 | 250 | }) |
326 | 251 | .setParallelism(1); |
@@ -398,6 +323,50 @@ public void processElement2( |
398 | 323 | return deleted; |
399 | 324 | } |
400 | 325 |
|
| 326 | + private void listPaimonFilesForTable(Collector<Tuple2<String, Long>> out) { |
| 327 | + FileStorePathFactory pathFactory = table.store().pathFactory(); |
| 328 | + List<String> dirs = |
| 329 | + listPaimonFileDirs( |
| 330 | + table.fullName(), |
| 331 | + pathFactory.manifestPath().toString(), |
| 332 | + pathFactory.indexPath().toString(), |
| 333 | + pathFactory.statisticsPath().toString(), |
| 334 | + pathFactory.dataFilePath().toString(), |
| 335 | + partitionKeysNum, |
| 336 | + table.coreOptions().dataFileExternalPaths()) |
| 337 | + .stream() |
| 338 | + .map(Path::toUri) |
| 339 | + .map(Object::toString) |
| 340 | + .collect(Collectors.toList()); |
| 341 | + Set<Path> emptyDirs = new HashSet<>(); |
| 342 | + for (String dir : dirs) { |
| 343 | + Path dirPath = new Path(dir); |
| 344 | + List<FileStatus> files = tryBestListingDirs(dirPath); |
| 345 | + for (FileStatus file : files) { |
| 346 | + if (oldEnough(file)) { |
| 347 | + out.collect(new Tuple2<>(file.getPath().toUri().toString(), file.getLen())); |
| 348 | + } |
| 349 | + } |
| 350 | + if (files.isEmpty()) { |
| 351 | + emptyDirs.add(dirPath); |
| 352 | + } |
| 353 | + } |
| 354 | + |
| 355 | + // delete empty dir |
| 356 | + while (!emptyDirs.isEmpty()) { |
| 357 | + Set<Path> newEmptyDir = new HashSet<>(); |
| 358 | + for (Path emptyDir : emptyDirs) { |
| 359 | + try { |
| 360 | + fileIO.delete(emptyDir, false); |
| 361 | + // recursive cleaning |
| 362 | + newEmptyDir.add(emptyDir.getParent()); |
| 363 | + } catch (IOException ignored) { |
| 364 | + } |
| 365 | + } |
| 366 | + emptyDirs = newEmptyDir; |
| 367 | + } |
| 368 | + } |
| 369 | + |
401 | 370 | public static CleanOrphanFilesResult executeDatabaseOrphanFiles( |
402 | 371 | StreamExecutionEnvironment env, |
403 | 372 | Catalog catalog, |
|
0 commit comments