Set the temporaryGcsBucket to default to fs.gs.system.bucket if it exists

davidrabinowitz · davidrabinowitz · commit 347586a706ba · 2024-11-27T15:26:59.000-08:00
diff --git a/CHANGES.md b/CHANGES.md
@@ -3,7 +3,8 @@
 ## Next
 * Issue #1290: Stopped using metadata for optimized count path
 * Issue #1317: Improving OpenLineage 1.24.0+ compatibility
-* PR #1311 : Improve read session expired error message
+* PR #1311: Improve read session expired error message
+* PR #1320: Set the `temporaryGcsBucket` to default to `fs.gs.system.bucket` if it exists, negating the need to set it on Dataproc clusters.
 
 ## 0.41.0 - 2024-09-05
 
diff --git a/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/SparkBigQueryConfig.java b/spark-bigquery-connector-common/src/main/java/com/google/cloud/spark/bigquery/SparkBigQueryConfig.java
@@ -171,6 +171,8 @@ public static WriteMethod from(@Nullable String writeMethod) {
   public static final String BIG_NUMERIC_DEFAULT_PRECISION = "bigNumericDefaultPrecision";
   public static final String BIG_NUMERIC_DEFAULT_SCALE = "bigNumericDefaultScale";
 
+  private static final String DATAPROC_SYSTEM_BUCKET_CONFIGURATION = "fs.gs.system.bucket";
+
   TableId tableId;
   // as the config needs to be Serializable, internally it uses
   // com.google.common.base.Optional<String> but externally it uses the regular java.util.Optional
@@ -398,7 +400,10 @@ public static SparkBigQueryConfig from(
             .orNull();
     config.defaultParallelism = defaultParallelism;
     config.temporaryGcsBucket =
-        stripPrefix(getAnyOption(globalOptions, options, "temporaryGcsBucket"));
+        stripPrefix(getAnyOption(globalOptions, options, "temporaryGcsBucket"))
+            .or(
+                com.google.common.base.Optional.fromNullable(
+                    hadoopConfiguration.get(DATAPROC_SYSTEM_BUCKET_CONFIGURATION)));
     config.persistentGcsBucket =
         stripPrefix(getAnyOption(globalOptions, options, "persistentGcsBucket"));
     config.persistentGcsPath = getOption(options, "persistentGcsPath");
diff --git a/spark-bigquery-connector-common/src/test/java/com/google/cloud/spark/bigquery/SparkBigQueryConfigTest.java b/spark-bigquery-connector-common/src/test/java/com/google/cloud/spark/bigquery/SparkBigQueryConfigTest.java
@@ -1182,4 +1182,22 @@ public void testEnableListInferenceWithDefaultIntermediateFormat() {
     assertThat(config.getIntermediateFormat())
         .isEqualTo(SparkBigQueryConfig.IntermediateFormat.PARQUET_LIST_INFERENCE_ENABLED);
   }
+
+  @Test
+  public void testSystemBucketAsDefaultTemporaryGcsBucket() {
+    Configuration hadoopConfiguration = new Configuration();
+    hadoopConfiguration.set("fs.gs.system.bucket", "foo");
+    SparkBigQueryConfig config =
+        SparkBigQueryConfig.from(
+            asDataSourceOptionsMap(defaultOptions),
+            emptyMap, // allConf
+            hadoopConfiguration,
+            emptyMap, // customDefaults
+            1,
+            new SQLConf(),
+            sparkVersion,
+            /* schema */ Optional.empty(),
+            /* tableIsMandatory */ true);
+    assertThat(config.getTemporaryGcsBucket()).hasValue("foo");
+  }
 }