fix

JingsongLi · JingsongLi · commit 5d7d7ca8a38b · 2025-07-15T16:48:22.000+08:00
diff --git a/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java b/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java
@@ -57,6 +57,9 @@
 import java.util.stream.Collectors;
 
 import static org.apache.paimon.CoreOptions.MergeEngine.FIRST_ROW;
+import static org.apache.paimon.CoreOptions.OrderType.HILBERT;
+import static org.apache.paimon.CoreOptions.OrderType.ORDER;
+import static org.apache.paimon.CoreOptions.OrderType.ZORDER;
 import static org.apache.paimon.options.ConfigOptions.key;
 import static org.apache.paimon.options.MemorySize.VALUE_128_MB;
 import static org.apache.paimon.options.MemorySize.VALUE_256_MB;
@@ -1835,6 +1838,28 @@ public InlineElement getDescription() {
                                     + "starting from the snapshot after this one. If found, commit will be aborted. "
                                     + "If the value of this option is -1, committer will not check for its first commit.");
 
+    public static final ConfigOption<String> CLUSTERING_COLUMNS =
+            key("clustering.columns")
+                    .stringType()
+                    .noDefaultValue()
+                    .withFallbackKeys("sink.clustering.by-columns")
+                    .withDescription(
+                            "Specifies the column name(s) used for comparison during range partitioning, in the format 'columnName1,columnName2'. "
+                                    + "If not set or set to an empty string, it indicates that the range partitioning feature is not enabled. "
+                                    + "This option will be effective only for append table without primary keys and batch execution mode.");
+
+    public static final ConfigOption<String> CLUSTERING_STRATEGY =
+            key("clustering.strategy")
+                    .stringType()
+                    .defaultValue("auto")
+                    .withFallbackKeys("sink.clustering.strategy")
+                    .withDescription(
+                            "Specifies the comparison algorithm used for range partitioning, including 'zorder', 'hilbert', and 'order', "
+                                    + "corresponding to the z-order curve algorithm, hilbert curve algorithm, and basic type comparison algorithm, "
+                                    + "respectively. When not configured, it will automatically determine the algorithm based on the number of columns "
+                                    + "in 'clustering.columns'. 'order' is used for 1 column, 'zorder' for less than 5 columns, "
+                                    + "and 'hilbert' for 5 or more columns.");
+
     private final Options options;
 
     public CoreOptions(Map<String, String> options) {
@@ -2803,6 +2828,50 @@ public Optional<Long> commitStrictModeLastSafeSnapshot() {
         return options.getOptional(COMMIT_STRICT_MODE_LAST_SAFE_SNAPSHOT);
     }
 
+    /** Returns the columns configured by {@link #CLUSTERING_COLUMNS}, or an empty list. */
+    public List<String> clusteringColumns() {
+        return clusteringColumns(options.get(CLUSTERING_COLUMNS));
+    }
+
+    /** Resolves {@link #CLUSTERING_STRATEGY} for the given number of clustering columns. */
+    public OrderType clusteringStrategy(int columnSize) {
+        return clusteringStrategy(options.get(CLUSTERING_STRATEGY), columnSize);
+    }
+
+    /**
+     * Parses a comma-separated list of column names, ignoring whitespace around each name.
+     *
+     * @param clusteringColumns raw option value, may be null
+     * @return the column names in their listed order; empty if the value is null or blank
+     */
+    public static List<String> clusteringColumns(String clusteringColumns) {
+        if (clusteringColumns == null || clusteringColumns.trim().isEmpty()) {
+            return Collections.emptyList();
+        }
+        return Arrays.stream(clusteringColumns.split(","))
+                .map(String::trim)
+                .collect(Collectors.toList());
+    }
+
+    /**
+     * Resolves the ordering algorithm for a clustering strategy value.
+     *
+     * <p>A null value or the default {@code "auto"} selects by column count: {@code ORDER} for
+     * one column, {@code ZORDER} for fewer than five, {@code HILBERT} otherwise. Any other value
+     * is passed to {@code OrderType.of}.
+     */
+    public static OrderType clusteringStrategy(String clusteringStrategy, int columnSize) {
+        if (clusteringStrategy != null
+                && !CLUSTERING_STRATEGY.defaultValue().equals(clusteringStrategy)) {
+            return OrderType.of(clusteringStrategy);
+        }
+        // "auto" (the default) or no value: choose by the number of clustering columns.
+        if (columnSize == 1) {
+            return ORDER;
+        }
+        return columnSize < 5 ? ZORDER : HILBERT;
+    }
+
     /** Specifies the merge engine for table with primary key. */
     public enum MergeEngine implements DescribedEnum {
         DEDUPLICATE("deduplicate", "De-duplicate and keep the last row."),
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java
@@ -404,26 +404,6 @@ public class FlinkConnectorOptions {
                     .withDescription(
                             "Whether trigger partition mark done when recover from state.");
 
-    public static final ConfigOption<String> CLUSTERING_COLUMNS =
-            key("sink.clustering.by-columns")
-                    .stringType()
-                    .noDefaultValue()
-                    .withDescription(
-                            "Specifies the column name(s) used for comparison during range partitioning, in the format 'columnName1,columnName2'. "
-                                    + "If not set or set to an empty string, it indicates that the range partitioning feature is not enabled. "
-                                    + "This option will be effective only for bucket unaware table without primary keys and batch execution mode.");
-
-    public static final ConfigOption<String> CLUSTERING_STRATEGY =
-            key("sink.clustering.strategy")
-                    .stringType()
-                    .defaultValue("auto")
-                    .withDescription(
-                            "Specifies the comparison algorithm used for range partitioning, including 'zorder', 'hilbert', and 'order', "
-                                    + "corresponding to the z-order curve algorithm, hilbert curve algorithm, and basic type comparison algorithm, "
-                                    + "respectively. When not configured, it will automatically determine the algorithm based on the number of columns "
-                                    + "in 'sink.clustering.by-columns'. 'order' is used for 1 column, 'zorder' for less than 5 columns, "
-                                    + "and 'hilbert' for 5 or more columns.");
-
     public static final ConfigOption<Boolean> CLUSTERING_SORT_IN_CLUSTER =
             key("sink.clustering.sort-in-cluster")
                     .booleanType()
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSinkBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSinkBuilder.java
@@ -18,6 +18,7 @@
 
 package org.apache.paimon.flink.sink;
 
+import org.apache.paimon.CoreOptions;
 import org.apache.paimon.CoreOptions.OrderType;
 import org.apache.paimon.CoreOptions.PartitionSinkStrategy;
 import org.apache.paimon.annotation.Public;
@@ -54,11 +55,12 @@
 import java.util.List;
 import java.util.Map;
 
+import static org.apache.paimon.CoreOptions.CLUSTERING_STRATEGY;
 import static org.apache.paimon.CoreOptions.OrderType.HILBERT;
 import static org.apache.paimon.CoreOptions.OrderType.ORDER;
 import static org.apache.paimon.CoreOptions.OrderType.ZORDER;
+import static org.apache.paimon.CoreOptions.clusteringStrategy;
 import static org.apache.paimon.flink.FlinkConnectorOptions.CLUSTERING_SAMPLE_FACTOR;
-import static org.apache.paimon.flink.FlinkConnectorOptions.CLUSTERING_STRATEGY;
 import static org.apache.paimon.flink.FlinkConnectorOptions.MIN_CLUSTERING_SAMPLE_FACTOR;
 import static org.apache.paimon.flink.sink.FlinkSink.isStreaming;
 import static org.apache.paimon.flink.sink.FlinkStreamPartitioner.partition;
@@ -146,7 +148,8 @@ public FlinkSinkBuilder clusteringIfPossible(
             int sampleFactor) {
         // The clustering will be skipped if the clustering columns are empty or the execution
         // mode is STREAMING or the table type is illegal.
-        if (clusteringColumns == null || clusteringColumns.isEmpty()) {
+        List<String> columns = CoreOptions.clusteringColumns(clusteringColumns);
+        if (columns.isEmpty()) {
             return this;
         }
         checkState(input != null, "The input stream should be specified earlier.");
@@ -159,7 +162,6 @@ public FlinkSinkBuilder clusteringIfPossible(
         }
         // If the clustering is not skipped, check the clustering column names and sample
         // factor value.
-        List<String> columns = Arrays.asList(clusteringColumns.split(","));
         List<String> fieldNames = table.schema().fieldNames();
         checkState(
                 new HashSet<>(fieldNames).containsAll(new HashSet<>(columns)),
@@ -174,17 +176,7 @@ public FlinkSinkBuilder clusteringIfPossible(
                         + MIN_CLUSTERING_SAMPLE_FACTOR
                         + ".");
         TableSortInfo.Builder sortInfoBuilder = new TableSortInfo.Builder();
-        if (clusteringStrategy.equals(CLUSTERING_STRATEGY.defaultValue())) {
-            if (columns.size() == 1) {
-                sortInfoBuilder.setSortStrategy(ORDER);
-            } else if (columns.size() < 5) {
-                sortInfoBuilder.setSortStrategy(ZORDER);
-            } else {
-                sortInfoBuilder.setSortStrategy(HILBERT);
-            }
-        } else {
-            sortInfoBuilder.setSortStrategy(OrderType.of(clusteringStrategy));
-        }
+        sortInfoBuilder.setSortStrategy(clusteringStrategy(clusteringStrategy, columns.size()));
         int upstreamParallelism = input.getParallelism();
         String sinkParallelismValue =
                 table.options().get(FlinkConnectorOptions.SINK_PARALLELISM.key());
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkTableSinkBase.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkTableSinkBase.java
@@ -43,12 +43,12 @@
 import java.util.Map;
 
 import static org.apache.paimon.CoreOptions.CHANGELOG_PRODUCER;
+import static org.apache.paimon.CoreOptions.CLUSTERING_COLUMNS;
+import static org.apache.paimon.CoreOptions.CLUSTERING_STRATEGY;
 import static org.apache.paimon.CoreOptions.LOG_CHANGELOG_MODE;
 import static org.apache.paimon.CoreOptions.MERGE_ENGINE;
-import static org.apache.paimon.flink.FlinkConnectorOptions.CLUSTERING_COLUMNS;
 import static org.apache.paimon.flink.FlinkConnectorOptions.CLUSTERING_SAMPLE_FACTOR;
 import static org.apache.paimon.flink.FlinkConnectorOptions.CLUSTERING_SORT_IN_CLUSTER;
-import static org.apache.paimon.flink.FlinkConnectorOptions.CLUSTERING_STRATEGY;
 import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_PARALLELISM;
 
 /** Table sink to create sink. */
diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/sort/SparkZOrderUDF.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/sort/SparkZOrderUDF.java
@@ -75,14 +75,10 @@ public SparkZOrderUDF(int numCols, int varTypeSize, int maxOutputSize) {
         this.maxOutputSize = maxOutputSize;
     }
 
-    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
-        in.defaultReadObject();
-        inputBuffers = ThreadLocal.withInitial(() -> new ByteBuffer[numCols]);
-        inputHolder = ThreadLocal.withInitial(() -> new byte[numCols][]);
-        outputBuffer = ThreadLocal.withInitial(() -> ByteBuffer.allocate(totalOutputBytes));
-    }
-
     private ByteBuffer inputBuffer(int position, int size) {
+        if (inputBuffers == null) {
+            inputBuffers = ThreadLocal.withInitial(() -> new ByteBuffer[numCols]);
+        }
         ByteBuffer buffer = inputBuffers.get()[position];
         if (buffer == null) {
             buffer = ByteBuffer.allocate(size);
@@ -92,6 +88,13 @@ private ByteBuffer inputBuffer(int position, int size) {
     }
 
     byte[] interleaveBits(Seq<byte[]> scalaBinary) {
+        if (inputHolder == null) {
+            inputHolder = ThreadLocal.withInitial(() -> new byte[numCols][]);
+        }
+        if (outputBuffer == null) {
+            outputBuffer = ThreadLocal.withInitial(() -> ByteBuffer.allocate(totalOutputBytes));
+        }
+
         byte[][] columnsBinary =
                 JavaConverters.seqAsJavaList(scalaBinary).toArray(inputHolder.get());
         return ZOrderByteUtils.interleaveBits(columnsBinary, totalOutputBytes, outputBuffer.get());
diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/sort/ZorderSorter.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/sort/ZorderSorter.java
@@ -38,6 +38,7 @@ public ZorderSorter(FileStoreTable table, List<String> zOrderColNames) {
         checkNotEmpty();
     }
 
+    @Override
     public Dataset<Row> sort(Dataset<Row> df) {
         Column zColumn = zValue(df);
         Dataset<Row> zValueDF = df.withColumn(Z_COLUMN, zColumn);
diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/WriteIntoPaimonTable.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/WriteIntoPaimonTable.scala
@@ -22,8 +22,8 @@ import org.apache.paimon.CoreOptions.DYNAMIC_PARTITION_OVERWRITE
 import org.apache.paimon.options.Options
 import org.apache.paimon.spark._
 import org.apache.paimon.spark.schema.SparkSystemColumns
+import org.apache.paimon.spark.sort.TableSorter
 import org.apache.paimon.table.FileStoreTable
-
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{DataFrame, PaimonUtils, Row, SparkSession}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
@@ -67,6 +67,11 @@ case class WriteIntoPaimonTable(
       }
     }
 
+    val clusteringColumns = table.coreOptions().clusteringColumns()
+    if (!clusteringColumns.isEmpty) {
+      data = clusteringInput(data, clusteringColumns)
+    }
+
     val (dynamicPartitionOverwriteMode, overwritePartition) = parseSaveMode()
     // use the extra options to rebuild the table object
     updateTableWithOptions(
@@ -82,6 +87,12 @@ case class WriteIntoPaimonTable(
     Seq.empty
   }
 
+  private def clusteringInput(data: DataFrame, clusteringColumns: java.util.List[String]): DataFrame = {
+    // The strategy depends on how many clustering columns there are, not on the table width.
+    val strategy = table.coreOptions().clusteringStrategy(clusteringColumns.size())
+    TableSorter.getSorter(table, strategy, clusteringColumns).sort(data)
+  }
+
   private def parseSaveMode(): (Boolean, Map[String, String]) = {
     var dynamicPartitionOverwriteMode = false
     val overwritePartition = saveMode match {
diff --git a/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkWriteITCase.java b/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkWriteITCase.java
@@ -124,6 +124,14 @@ public void testWriteWithDefaultValue() {
                         "[[1,2,my_value], [2,2,my_value], [3,2,my_value], [4,2,my_value], [5,3,my_value]]");
     }
 
+    @Test
+    public void testWriteWithClustering() {
+        spark.sql("CREATE TABLE T (a INT, b INT) TBLPROPERTIES ('clustering.columns'='a')");
+        spark.sql("INSERT INTO T VALUES (1, 1), (3, 3), (2, 2)").collectAsList();
+        List<Row> rows = spark.sql("SELECT * FROM T").collectAsList();
+        assertThat(rows.toString()).isEqualTo("[[1,1], [2,2], [3,3]]");
+    }
+
     @Test
     public void testWrite() {
         spark.sql(

Original file line number	Diff line number	Diff line change
`@@ -38,6 +38,7 @@ public ZorderSorter(FileStoreTable table, List<String> zOrderColNames) {`
`38`	`38`	`checkNotEmpty();`
`39`	`39`	`}`
`40`	`40`
	`41`	`+ @Override`
`41`	`42`	`public Dataset<Row> sort(Dataset<Row> df) {`
`42`	`43`	`Column zColumn = zValue(df);`
`43`	`44`	`Dataset<Row> zValueDF = df.withColumn(Z_COLUMN, zColumn);`