
Commit e117a8c

Merge branch 'master' into streaming
2 parents: fe03b35 + bb9f7c1

File tree

4 files changed: 37 additions & 47 deletions


CHANGES.md

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 * Added new connector, `spark-4.1-bigquery` aimed to be used in Spark 4.1. Like Spark 4.1, this connector requires at
   least Java 17 runtime. It is currently in preview mode.
 * PR #1445: Add streaming support for Spark DS v2 indirect write.
+* PR #1452: Improved the performance of the dynamic partition overwrite for RANGE_BUCKET partitioned tables.
 
 ## 0.43.1 - 2025-10-22
 * Issue #1417: Fixed ClassCastException in AWS federated identity

README-template.md

Lines changed: 2 additions & 2 deletions
@@ -1197,8 +1197,8 @@ val df = spark.read.format("bigquery")
 
 ### Configuring Partitioning
 
-By default the connector creates one partition per 400MB in the table being read (before filtering). This should roughly correspond to the maximum number of readers supported by the BigQuery Storage API.
-This can be configured explicitly with the <code>[maxParallelism](#properties)</code> property. BigQuery may limit the number of partitions based on server constraints.
+By default, the connector calculates the requested `maxParallelism` as the larger of `preferredMinParallelism` (which defaults to 3 times the application's default parallelism) and 20,000. BigQuery may limit the number of partitions based on server constraints.
+Both <code>[maxParallelism](#properties)</code> and <code>[preferredMinParallelism](#properties)</code> can be configured explicitly to control the number of partitions.
 
 ## Tagging BigQuery Resources

README.md

Lines changed: 2 additions & 2 deletions
@@ -1191,8 +1191,8 @@ val df = spark.read.format("bigquery")
 
 ### Configuring Partitioning
 
-By default the connector creates one partition per 400MB in the table being read (before filtering). This should roughly correspond to the maximum number of readers supported by the BigQuery Storage API.
-This can be configured explicitly with the <code>[maxParallelism](#properties)</code> property. BigQuery may limit the number of partitions based on server constraints.
+By default, the connector calculates the requested `maxParallelism` as the larger of `preferredMinParallelism` (which defaults to 3 times the application's default parallelism) and 20,000. BigQuery may limit the number of partitions based on server constraints.
+Both <code>[maxParallelism](#properties)</code> and <code>[preferredMinParallelism](#properties)</code> can be configured explicitly to control the number of partitions.
 
 ## Tagging BigQuery Resources

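To make the two partitioning properties concrete, here is a minimal read that sets both explicitly. This is an illustrative sketch only: the option names are the properties documented above, while the session setup and the public bigquery-public-data.samples.shakespeare table are placeholder choices.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class PartitioningOptionsExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("partitioning-example").getOrCreate();

    // Request at most 10,000 read streams, and ask for at least 60;
    // BigQuery may still grant fewer based on server constraints.
    Dataset<Row> df =
        spark
            .read()
            .format("bigquery")
            .option("maxParallelism", "10000")
            .option("preferredMinParallelism", "60")
            .load("bigquery-public-data.samples.shakespeare");

    // The actual partition count reflects what the Storage API granted.
    System.out.println(df.rdd().getNumPartitions());
  }
}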
bigquery-connector-common/src/main/java/com/google/cloud/bigquery/connector/common/BigQueryUtil.java

Lines changed: 32 additions & 43 deletions
@@ -769,6 +769,23 @@ static String getQueryForTimePartitionedTable(
         String.format(
             "%s(`target`.`%s`, %s)", truncFuntion, partitionField, partitionType.toString());
 
+    return createOptimizedMergeQuery(
+        destinationDefinition,
+        destinationTableName,
+        temporaryTableName,
+        extractedPartitionedSource,
+        extractedPartitionedTarget,
+        /* partitionMatchAdditionalCondition */ "TRUE");
+  }
+
+  private static String createOptimizedMergeQuery(
+      StandardTableDefinition destinationDefinition,
+      String destinationTableName,
+      String temporaryTableName,
+      String extractedPartitionedSource,
+      String extractedPartitionedTarget,
+      String partitionMatchAdditionalCondition) {
+    FieldList allFields = destinationDefinition.getSchema().getFields();
     String commaSeparatedFields =
         allFields.stream().map(Field::getName).collect(Collectors.joining("`,`", "`", "`"));
 
@@ -778,7 +795,7 @@ static String getQueryForTimePartitionedTable(
             + "MERGE `%s` AS target\n"
             + "USING `%s` AS source\n"
             + "ON FALSE\n"
-            + "WHEN NOT MATCHED BY SOURCE AND %s IN UNNEST(partitions_to_delete) THEN DELETE\n"
+            + "WHEN NOT MATCHED BY SOURCE AND (%s) AND %s IN UNNEST(partitions_to_delete) THEN DELETE\n"
             + "WHEN NOT MATCHED BY TARGET THEN\n"
             + "INSERT(%s) VALUES(%s)";
     return String.format(
@@ -787,6 +804,7 @@ static String getQueryForTimePartitionedTable(
         temporaryTableName,
         destinationTableName,
         temporaryTableName,
+        partitionMatchAdditionalCondition,
         extractedPartitionedTarget,
         commaSeparatedFields,
         commaSeparatedFields);
@@ -803,57 +821,28 @@ static String getQueryForRangePartitionedTable(
     long interval = rangePartitioning.getRange().getInterval();
 
     String partitionField = rangePartitioning.getField();
-    String extractedPartitioned =
-        "IFNULL(IF(%s.%s >= %s, 0, RANGE_BUCKET(%s.%s, GENERATE_ARRAY(%s, %s, %s))), -1)";
+
     String extractedPartitionedSource =
         String.format(
-            extractedPartitioned,
-            "source",
-            partitionField,
-            end,
-            "source",
-            partitionField,
-            start,
-            end,
-            interval);
+            "IFNULL(IF(%s >= %s, 0, RANGE_BUCKET(%s, GENERATE_ARRAY(%s, %s, %s))), -1)",
+            partitionField, end, partitionField, start, end, interval);
     String extractedPartitionedTarget =
         String.format(
-            extractedPartitioned,
-            "target",
-            partitionField,
-            end,
-            "target",
-            partitionField,
-            start,
-            end,
-            interval);
-
-    FieldList allFields = destinationDefinition.getSchema().getFields();
-    String commaSeparatedFields =
-        allFields.stream().map(Field::getName).collect(Collectors.joining("`,`", "`", "`"));
-    String booleanInjectedColumn = "_" + Long.toString(1234567890123456789L);
+            "IFNULL(IF(target.%s >= %s, 0, RANGE_BUCKET(target.%s, GENERATE_ARRAY(%s, %s, %s))), -1)",
+            partitionField, end, partitionField, start, end, interval);
+    // needed for tables that require the partition field to be in the where clause. It must be
+    // true.
+    String partitionMatchAdditionalCondition =
+        String.format(
+            "target.%s is NULL OR target.%s >= %d", partitionField, partitionField, Long.MIN_VALUE);
 
-    String queryFormat =
-        "MERGE `%s` AS target\n"
-            + "USING (SELECT * FROM `%s` CROSS JOIN UNNEST([true, false]) %s) AS source\n"
-            + "ON %s = %s AND %s AND (target.%s >= %d OR target.%s IS NULL )\n"
-            + "WHEN MATCHED THEN DELETE\n"
-            + "WHEN NOT MATCHED AND NOT %s THEN\n"
-            + "INSERT(%s) VALUES(%s)";
-    return String.format(
-        queryFormat,
+    return createOptimizedMergeQuery(
+        destinationDefinition,
         destinationTableName,
         temporaryTableName,
-        booleanInjectedColumn,
         extractedPartitionedSource,
         extractedPartitionedTarget,
-        booleanInjectedColumn,
-        partitionField,
-        BIGQUERY_INTEGER_MIN_VALUE,
-        partitionField,
-        booleanInjectedColumn,
-        commaSeparatedFields,
-        commaSeparatedFields);
+        partitionMatchAdditionalCondition);
   }
 
   // based on https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfiguration, it
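The refactor is easiest to follow from the query the shared helper emits. The sketch below reproduces the string assembly for a hypothetical table range-partitioned on an INTEGER column customer_id (RANGE start 0, end 100, interval 10). The table names dataset.target and dataset.tmp, the two-column insert list, and the omission of the DECLARE of partitions_to_delete that precedes the MERGE are illustrative assumptions, not the connector's exact output.

public class OptimizedMergeQuerySketch {
  public static void main(String[] args) {
    String partitionField = "customer_id";
    long start = 0, end = 100, interval = 10;

    // Partition-id expression for target rows, mirroring getQueryForRangePartitionedTable:
    // NULL maps to -1, values at or beyond `end` map to 0, everything else to its bucket.
    String extractedPartitionedTarget =
        String.format(
            "IFNULL(IF(target.%s >= %s, 0, RANGE_BUCKET(target.%s, GENERATE_ARRAY(%s, %s, %s))), -1)",
            partitionField, end, partitionField, start, end, interval);

    // Always-true guard, kept so the partition column shows up in the DELETE predicate
    // for tables that require the partition field in the WHERE clause.
    String partitionMatchAdditionalCondition =
        String.format(
            "target.%s is NULL OR target.%s >= %d", partitionField, partitionField, Long.MIN_VALUE);

    // Shape of the MERGE assembled by createOptimizedMergeQuery; table names and the
    // two-column insert list are hypothetical.
    String merge =
        String.format(
            "MERGE `%s` AS target\n"
                + "USING `%s` AS source\n"
                + "ON FALSE\n"
                + "WHEN NOT MATCHED BY SOURCE AND (%s) AND %s IN UNNEST(partitions_to_delete) THEN DELETE\n"
                + "WHEN NOT MATCHED BY TARGET THEN\n"
                + "INSERT(`customer_id`,`amount`) VALUES(`customer_id`,`amount`)",
            "dataset.target", "dataset.tmp", partitionMatchAdditionalCondition,
            extractedPartitionedTarget);
    System.out.println(merge);
  }
}

Because the join condition is ON FALSE, every source row takes the NOT MATCHED BY TARGET branch and is inserted, while every target row takes the NOT MATCHED BY SOURCE branch and is deleted only if its computed partition id is listed in partitions_to_delete; the always-true guard exists solely so the partition column appears in the delete predicate.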
