Properly clean up the entire folder for jobs using streaming.

yalimu-g · yalimu-g · commit 1fb7ca51ed25 · 2025-12-08T13:57:58.000-08:00
diff --git a/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/SparkBigQueryUtil.java b/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/SparkBigQueryUtil.java
@@ -138,6 +138,19 @@ public static Path createGcsPath(
     return gcsPath;
   }
 
+  public static Path getGcsPathWithApplicationId(
+      SparkBigQueryConfig config, Configuration conf, String applicationId) {
+    String bucket = null;
+    if (config.getPersistentGcsBucket().isPresent()) {
+      bucket = config.getPersistentGcsBucket().get();
+    } else if (config.getTemporaryGcsBucket().isPresent()) {
+      bucket = config.getTemporaryGcsBucket().get();
+    } else {
+      bucket = config.getPersistentGcsBucket().get();
+    }
+    return new Path(String.format("gs://%s/.spark-bigquery-%s-*", bucket, applicationId));
+  }
+
   private static Path getUniqueGcsPath(String gcsBucket, String applicationId, Configuration conf)
       throws IOException {
     boolean needNewPath = true;
diff --git a/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/write/BigQueryWriteHelper.java b/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/write/BigQueryWriteHelper.java
@@ -92,7 +92,15 @@ public BigQueryWriteHelper(
     this.gcsPath =
         SparkBigQueryUtil.createGcsPath(config, conf, sqlContext.sparkContext().applicationId());
     this.createTemporaryPathDeleter =
-        config.getTemporaryGcsBucket().map(unused -> new IntermediateDataCleaner(gcsPath, conf));
+        config
+            .getTemporaryGcsBucket()
+            .map(
+                unused ->
+                    new IntermediateDataCleaner(
+                        gcsPath,
+                        conf,
+                        SparkBigQueryUtil.getGcsPathWithApplicationId(
+                            config, conf, sqlContext.sparkContext().applicationId())));
 
     Schema schema =
         SchemaConverters.from(SchemaConvertersConfiguration.from(config))
diff --git a/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/write/IntermediateDataCleaner.java b/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/write/IntermediateDataCleaner.java
@@ -15,7 +15,9 @@
  */
 package com.google.cloud.spark.bigquery.write;
 
+import java.io.IOException;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.slf4j.Logger;
@@ -32,15 +34,18 @@ public class IntermediateDataCleaner extends Thread {
   private final Path path;
   /** the hadoop configuration */
   private final Configuration conf;
+  /** glob pattern matching every intermediate GCS path created for this application */
+  private final Path gcsPathPrefix;
 
-  public IntermediateDataCleaner(Path path, Configuration conf) {
+  public IntermediateDataCleaner(Path path, Configuration conf, Path gcsPathPrefix) {
     this.path = path;
     this.conf = conf;
+    this.gcsPathPrefix = gcsPathPrefix;
   }
 
   @Override
   public void run() {
-    deletePath();
+    deleteGcsPath();
   }
 
   public void deletePath() {
@@ -56,14 +61,44 @@ public void deletePath() {
     }
   }
 
-  public void deleteGcsPath(Path gcsPath) {
+  /** Best-effort recursive delete of every path matching the {@code gcsPathPrefix} glob. */
+  public void deleteGcsPath() {
+    logger.info("Deleting Gcs path " + gcsPathPrefix + " if it exists");
     try {
-      logger.info("Deleting Gcs path " + gcsPath + " if it exists");
-      FileSystem fs = gcsPath.getFileSystem(conf);
-      fs.delete(gcsPath, true); // <-- The crucial recursive delete call
-      logger.info("Successfully deleted main GCS path: {}", gcsPath);
-    } catch (Exception e) {
-      logger.error("Failed to delete main GCS path: {}", gcsPath, e);
+      FileSystem fs = FileSystem.get(gcsPathPrefix.toUri(), conf);
+      FileStatus[] statuses = fs.globStatus(gcsPathPrefix);
+
+      if (statuses == null || statuses.length == 0) {
+        logger.info("No paths found matching pattern: {}", gcsPathPrefix);
+        return;
+      }
+
+      logger.info(
+          "Found {} paths matching the pattern. Starting recursive deletion.", statuses.length);
+
+      boolean allSuccess = true;
+      for (FileStatus status : statuses) {
+        Path pathToDelete = status.getPath();
+        FileSystem deleteFs = FileSystem.get(pathToDelete.toUri(), conf);
+        boolean deleted = deleteFs.delete(pathToDelete, true);
+        if (deleted) {
+          logger.info("Successfully deleted path: {}", pathToDelete);
+        } else {
+          logger.error("Failed to delete path: {}", pathToDelete);
+          allSuccess = false;
+        }
+      }
+
+      if (allSuccess) {
+        logger.info("Completed cleanup for pattern: {}", gcsPathPrefix);
+      } else {
+        logger.warn(
+            "Completed cleanup, but one or more paths failed to delete for pattern: {}",
+            gcsPathPrefix);
+      }
+    } catch (IOException e) {
+      // run() calls this and the cleaner is used as a shutdown hook: log, do not rethrow.
+      logger.error("Failed to delete GCS paths matching pattern: {}", gcsPathPrefix, e);
     }
   }
 
diff --git a/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/write/context/BigQueryDataSourceWriterModule.java b/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/write/context/BigQueryDataSourceWriterModule.java
@@ -101,7 +101,12 @@ public BigQueryIndirectDataSourceWriterContext provideIndirectDataSourceWriterCo
             .map(
                 ignored ->
                     new IntermediateDataCleaner(
-                        gcsPath, spark.sparkContext().hadoopConfiguration()));
+                        gcsPath,
+                        spark.sparkContext().hadoopConfiguration(),
+                        SparkBigQueryUtil.getGcsPathWithApplicationId(
+                            tableConfig,
+                            spark.sparkContext().hadoopConfiguration(),
+                            spark.sparkContext().applicationId())));
     // based on pmkc's suggestion at https://git.io/JeWRt
     intermediateDataCleaner.ifPresent(cleaner -> Runtime.getRuntime().addShutdownHook(cleaner));
     return new BigQueryIndirectDataSourceWriterContext(