
Commit 2da48d5

[client] Support dynamic estimate write batch size (#976)
1 parent abe5966 commit 2da48d5

File tree

5 files changed: +205 -5 lines changed

fluss-client/src/main/java/com/alibaba/fluss/client/write/DynamicWriteBatchSizeEstimator.java
fluss-client/src/main/java/com/alibaba/fluss/client/write/RecordAccumulator.java
fluss-client/src/test/java/com/alibaba/fluss/client/write/DynamicWriteBatchSizeEstimatorTest.java
fluss-common/src/main/java/com/alibaba/fluss/config/ConfigOptions.java
website/docs/engine-flink/options.md
fluss-client/src/main/java/com/alibaba/fluss/client/write/DynamicWriteBatchSizeEstimator.java

Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2025 Alibaba Group Holding Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.alibaba.fluss.client.write;
+
+import com.alibaba.fluss.annotation.Internal;
+import com.alibaba.fluss.metadata.PhysicalTablePath;
+import com.alibaba.fluss.utils.MapUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.concurrent.ThreadSafe;
+
+import java.util.concurrent.ConcurrentHashMap;
+
+/** An estimator to estimate the buffer usage of a write batch. */
+@Internal
+@ThreadSafe
+public class DynamicWriteBatchSizeEstimator {
+
+    private static final Logger LOG = LoggerFactory.getLogger(DynamicWriteBatchSizeEstimator.class);
+
+    private static final double RATIO_TO_INCREASE_BATCH_SIZE = 0.9d;
+    private static final double RATIO_TO_DECREASE_BATCH_SIZE = 0.5d;
+
+    private final int maxBatchSize;
+    private final int pageSize;
+    private final boolean dynamicBatchSizeEnabled;
+
+    private final ConcurrentHashMap<PhysicalTablePath, Integer> estimatedBatchSizeMap;
+
+    public DynamicWriteBatchSizeEstimator(
+            boolean dynamicBatchSizeEnabled, int maxBatchSize, int pageSize) {
+        this.dynamicBatchSizeEnabled = dynamicBatchSizeEnabled;
+
+        if (dynamicBatchSizeEnabled) {
+            this.estimatedBatchSizeMap = MapUtils.newConcurrentHashMap();
+        } else {
+            this.estimatedBatchSizeMap = null;
+        }
+
+        this.maxBatchSize = maxBatchSize;
+        this.pageSize = pageSize;
+    }
+
+    public void updateEstimation(PhysicalTablePath physicalTablePath, int observedBatchSize) {
+        if (!dynamicBatchSizeEnabled) {
+            return;
+        }
+
+        int estimatedBatchSize =
+                estimatedBatchSizeMap.getOrDefault(physicalTablePath, maxBatchSize);
+        int newEstimatedBatchSize;
+        if (observedBatchSize >= estimatedBatchSize
+                || observedBatchSize > estimatedBatchSize * RATIO_TO_INCREASE_BATCH_SIZE) {
+            // increase by 10%
+            newEstimatedBatchSize = Math.min((int) (estimatedBatchSize * 1.1), maxBatchSize);
+        } else if (observedBatchSize < estimatedBatchSize * RATIO_TO_INCREASE_BATCH_SIZE
+                && observedBatchSize > estimatedBatchSize * RATIO_TO_DECREASE_BATCH_SIZE) {
+            // decrease by 5%
+            newEstimatedBatchSize = Math.max((int) (estimatedBatchSize * 0.95), pageSize);
+        } else {
+            // decrease by 10%
+            newEstimatedBatchSize = Math.max((int) (estimatedBatchSize * 0.9), pageSize);
+        }
+
+        estimatedBatchSizeMap.put(physicalTablePath, newEstimatedBatchSize);
+        LOG.debug(
+                "Set estimated batch size for {} from {} to {}",
+                physicalTablePath,
+                estimatedBatchSize,
+                newEstimatedBatchSize);
+    }
+
+    public int getEstimatedBatchSize(PhysicalTablePath physicalTablePath) {
+        return dynamicBatchSizeEnabled
+                ? estimatedBatchSizeMap.getOrDefault(physicalTablePath, maxBatchSize)
+                : maxBatchSize;
+    }
+}
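
A quick way to read the update rule: an observed batch at or above 90% of the current estimate grows it by 10% (capped at maxBatchSize), a batch between 50% and 90% shrinks it by 5%, and anything at or below 50% shrinks it by 10% (floored at pageSize). The standalone sketch below is not part of the commit; it just traces one table's estimate through the three regimes, using the client defaults of a 2 MB max batch size and a 128 KB page size.

    // Illustration only: trace the three adjustment regimes of the estimator.
    void traceEstimator(PhysicalTablePath tablePath) {
        // maxBatchSize = 2 MB, pageSize = 128 KB (the client defaults).
        DynamicWriteBatchSizeEstimator estimator =
                new DynamicWriteBatchSizeEstimator(true, 2 * 1024 * 1024, 128 * 1024);

        // Before any update, the estimate defaults to maxBatchSize: 2,097,152 bytes.
        int initial = estimator.getEstimatedBatchSize(tablePath);

        // Observed 2 MB >= estimate -> grow by 10%, capped at maxBatchSize: stays 2 MB.
        estimator.updateEstimation(tablePath, 2 * 1024 * 1024);

        // Observed 1.5 MB sits between 50% and 90% of 2 MB -> shrink by 5% to 1,992,294.
        estimator.updateEstimation(tablePath, 1536 * 1024);

        // Observed 100 KB is below 50% of the estimate -> shrink by 10% to 1,793,064;
        // repeated small batches keep shrinking it, but never below pageSize (131,072).
        estimator.updateEstimation(tablePath, 100 * 1024);
    }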

fluss-client/src/main/java/com/alibaba/fluss/client/write/RecordAccumulator.java

Lines changed: 20 additions & 5 deletions

@@ -79,7 +79,6 @@ public final class RecordAccumulator {
     private final AtomicInteger flushesInProgress;
     private final AtomicInteger appendsInProgress;
     private final int batchSize;
-    private final int pagesPerBatch;

     /**
      * An artificial delay time to add before declaring a records instance that isn't full ready for
@@ -110,6 +109,7 @@ public final class RecordAccumulator {

     private final IdempotenceManager idempotenceManager;
     private final Clock clock;
+    private final DynamicWriteBatchSizeEstimator batchSizeEstimator;

     // TODO add retryBackoffMs to retry the produce request upon receiving an error.
     // TODO add deliveryTimeoutMs to report success or failure on record delivery.
@@ -132,11 +132,15 @@ public final class RecordAccumulator {
                 Math.max(1, (int) conf.get(ConfigOptions.CLIENT_WRITER_BATCH_SIZE).getBytes());

         this.writerBufferPool = LazyMemorySegmentPool.createWriterBufferPool(conf);
-        this.pagesPerBatch = Math.max(1, MathUtils.ceilDiv(batchSize, writerBufferPool.pageSize()));
         this.bufferAllocator = new RootAllocator(Long.MAX_VALUE);
         this.arrowWriterPool = new ArrowWriterPool(bufferAllocator);
         this.incomplete = new IncompleteBatches();
         this.nodesDrainIndex = new HashMap<>();
+        this.batchSizeEstimator =
+                new DynamicWriteBatchSizeEstimator(
+                        conf.get(ConfigOptions.CLIENT_WRITER_DYNAMIC_BATCH_SIZE_ENABLED),
+                        batchSize,
+                        (int) conf.get(ConfigOptions.CLIENT_WRITER_BUFFER_PAGE_SIZE).getBytes());
         this.idempotenceManager = idempotenceManager;
         this.clock = clock;
         registerMetrics(writerMetricGroup);
@@ -200,7 +204,7 @@ public RecordAppendResult append(
                 return new RecordAppendResult(true, false, true);
             }

-            memorySegments = allocateMemorySegments(writeRecord);
+            memorySegments = allocateMemorySegments(writeRecord, physicalTablePath);
             synchronized (dq) {
                 RecordAppendResult appendResult =
                         appendNewBatch(
@@ -421,7 +425,15 @@ public Set<PhysicalTablePath> getPhysicalTablePathsInBatches() {
         return writeBatches.keySet();
     }

-    private List<MemorySegment> allocateMemorySegments(WriteRecord writeRecord) throws IOException {
+    private List<MemorySegment> allocateMemorySegments(
+            WriteRecord writeRecord, PhysicalTablePath physicalTablePath) throws IOException {
+        int pagesPerBatch =
+                Math.max(
+                        1,
+                        MathUtils.ceilDiv(
+                                batchSizeEstimator.getEstimatedBatchSize(physicalTablePath),
+                                writerBufferPool.pageSize()));
+
         if (writeRecord.getWriteFormat() == WriteFormat.ARROW_LOG) {
             // pre-allocate a batch memory size for Arrow, if it is not sufficient during batching,
             // it will allocate memory from heap
@@ -714,7 +726,10 @@ private List<ReadyWriteBatch> drainBatchesForOneNode(
                     // the rest of the work by processing outside the lock close() is particularly expensive
                     checkNotNull(batch, "batch should not be null");
                     batch.close();
-                    size += batch.estimatedSizeInBytes();
+                    int currentBatchSize = batch.estimatedSizeInBytes();
+                    size += currentBatchSize;
+                    batchSizeEstimator.updateEstimation(physicalTablePath, currentBatchSize);
+
                     ready.add(new ReadyWriteBatch(tableBucket, batch));
                     // mark the batch as drained.
                     batch.drained(System.currentTimeMillis());
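
The net effect of the RecordAccumulator change: pagesPerBatch is no longer a constant derived once from client.writer.batch-size, but is re-derived on every append from the current per-table estimate, and every drained batch feeds its observed size back into the estimator. A standalone sketch of the page arithmetic (not from the commit; ceilDiv here re-implements a plain ceiling division like MathUtils.ceilDiv is expected to compute):

    // Illustration only: the per-append page calculation.
    static int pagesPerBatch(int estimatedBatchSize, int pageSize) {
        int pages = (estimatedBatchSize + pageSize - 1) / pageSize; // ceil(a / b)
        return Math.max(1, pages);
    }

    // With the default 128 KB page size:
    // pagesPerBatch(2 * 1024 * 1024, 128 * 1024) -> 16 pages (fixed 2 MB batch)
    // pagesPerBatch(300 * 1024, 128 * 1024)      -> 3 pages  (a shrunken estimate)

So a table or partition whose batches have shrunk pins far fewer buffer pages per pre-allocated Arrow batch, which is where the memory-utilization claim in the new config description comes from.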
fluss-client/src/test/java/com/alibaba/fluss/client/write/DynamicWriteBatchSizeEstimatorTest.java

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2025 Alibaba Group Holding Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.alibaba.fluss.client.write;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import static com.alibaba.fluss.record.TestData.DATA1_PHYSICAL_TABLE_PATH;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Test for {@link DynamicWriteBatchSizeEstimator}. */
+public class DynamicWriteBatchSizeEstimatorTest {
+
+    private DynamicWriteBatchSizeEstimator estimator;
+
+    @BeforeEach
+    public void setup() {
+        estimator = new DynamicWriteBatchSizeEstimator(true, 1000, 100);
+    }
+
+    @Test
+    void testEstimator() {
+        assertThat(estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH)).isEqualTo(1000);
+        estimator = new DynamicWriteBatchSizeEstimator(false, 1000, 100);
+        assertThat(estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH)).isEqualTo(1000);
+
+        estimator = new DynamicWriteBatchSizeEstimator(true, 1000, 100);
+        // test decrease 10%
+        estimator.updateEstimation(DATA1_PHYSICAL_TABLE_PATH, 450);
+        assertThat(estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH)).isEqualTo(900);
+
+        // test decrease 5%
+        estimator.updateEstimation(DATA1_PHYSICAL_TABLE_PATH, (int) (900 * 0.9) - 10);
+        assertThat(estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH)).isEqualTo(855);
+
+        // test increase 10%
+        estimator.updateEstimation(DATA1_PHYSICAL_TABLE_PATH, 852);
+        assertThat(estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH))
+                .isEqualTo((int) (855 * 1.1));
+    }
+
+    @Test
+    void testMinDecreaseToPageSize() {
+        int estimatedSize = estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH);
+        estimator.updateEstimation(DATA1_PHYSICAL_TABLE_PATH, 1000);
+        while (estimatedSize > 100) {
+            estimator.updateEstimation(DATA1_PHYSICAL_TABLE_PATH, (int) (estimatedSize * 0.5) - 10);
+            estimatedSize = estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH);
+        }

+        assertThat(estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH)).isEqualTo(100);
+        estimator.updateEstimation(DATA1_PHYSICAL_TABLE_PATH, 0);
+        assertThat(estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH)).isEqualTo(100);
+    }
+
+    @Test
+    void testMaxIncreaseToMaxBatchSize() {
+        assertThat(estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH)).isEqualTo(1000);
+        estimator.updateEstimation(DATA1_PHYSICAL_TABLE_PATH, 2000);
+        assertThat(estimator.getEstimatedBatchSize(DATA1_PHYSICAL_TABLE_PATH)).isEqualTo(1000);
+    }
+}

fluss-common/src/main/java/com/alibaba/fluss/config/ConfigOptions.java

Lines changed: 15 additions & 0 deletions

@@ -813,6 +813,21 @@ public class ConfigOptions {
                                     "The writer or walBuilder will attempt to batch records together into one batch for"
                                             + " the same bucket. This helps performance on both the client and the server.");

+    public static final ConfigOption<Boolean> CLIENT_WRITER_DYNAMIC_BATCH_SIZE_ENABLED =
+            key("client.writer.dynamic-batch-size.enabled")
+                    .booleanType()
+                    .defaultValue(true)
+                    .withDescription(
+                            "Controls whether the client writer dynamically adjusts the batch size based on actual write throughput. Enabled by default. "
+                                    + "With dynamic batch sizing enabled, the writer adapts memory allocation per batch according to historical write sizes for the target table or partition. This ensures better memory utilization and performance under varying throughput conditions. The dynamic batch size is bounded: it will not exceed `"
+                                    + CLIENT_WRITER_BATCH_SIZE.key()
+                                    + "`, nor fall below `"
+                                    + CLIENT_WRITER_BUFFER_PAGE_SIZE.key()
+                                    + "`. "
+                                    + "When disabled, the writer uses a fixed batch size (`"
+                                    + CLIENT_WRITER_BATCH_SIZE.key()
+                                    + "`) for all batches, which may lead to frequent memory waits and suboptimal write performance if the incoming data rate is inconsistent across partitions.");
+
     public static final ConfigOption<Duration> CLIENT_WRITER_BATCH_TIMEOUT =
             key("client.writer.batch-timeout")
                     .durationType()
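
A hedged usage sketch: RecordAccumulator reads the flag via conf.get(...), so disabling dynamic batch sizing should only require setting the option on the client configuration. This assumes Fluss's Configuration exposes a set(ConfigOption<T>, T) counterpart to get, which is not shown in this diff.

    // Sketch (assumes Configuration#set exists as the counterpart of the
    // conf.get(...) calls shown in RecordAccumulator above).
    Configuration conf = new Configuration();
    conf.set(ConfigOptions.CLIENT_WRITER_DYNAMIC_BATCH_SIZE_ENABLED, false);
    // Every batch then reserves the fixed client.writer.batch-size (2 MB by default).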

website/docs/engine-flink/options.md

Lines changed: 1 addition & 0 deletions

@@ -143,6 +143,7 @@ ALTER TABLE log_table SET ('table.log.ttl' = '7d');
 | client.writer.buffer.page-size | MemorySize | 128kb | Size of every page in memory buffers (`client.writer.buffer.memory-size`). |
 | client.writer.buffer.per-request-memory-size | MemorySize | 16mb | The minimum number of bytes that will be allocated by the writer rounded down to the closest multiple of client.writer.buffer.page-size. It must be greater than or equal to client.writer.buffer.page-size. This option allows to allocate memory in batches to have better CPU-cached friendliness due to contiguous segments. |
 | client.writer.batch-size | MemorySize | 2mb | The writer or walBuilder will attempt to batch records together into one batch for the same bucket. This helps performance on both the client and the server. |
+| client.writer.dynamic-batch-size.enabled | Boolean | true | Controls whether the client writer dynamically adjusts the batch size based on actual write throughput. Enabled by default. With dynamic batch sizing enabled, the writer adapts memory allocation per batch according to historical write sizes for the target table or partition. This ensures better memory utilization and performance under varying throughput conditions. The dynamic batch size is bounded: it will not exceed `client.writer.batch-size`, nor fall below `client.writer.buffer.page-size`. When disabled, the writer uses a fixed batch size (`client.writer.batch-size`) for all batches, which may lead to frequent memory waits and suboptimal write performance if the incoming data rate is inconsistent across partitions. |
 | client.writer.buffer.wait-timeout | Duration | 2^(63)-1ns | Defines how long the writer will block when waiting for segments to become available. |
 | client.writer.batch-timeout | Duration | 100ms | The writer groups any rows that arrive in between request sends into a single batched request. Normally this occurs only under load when rows arrive faster than they can be sent out. However in some circumstances the writer may want to reduce the number of requests even under moderate load. This setting accomplishes this by adding a small amount of artificial delay, that is, rather than immediately sending out a row, the writer will wait for up to the given delay to allow other records to be sent so that the sends can be batched together. This can be thought of as analogous to Nagle's algorithm in TCP. This setting gives the upper bound on the delay for batching: once we get client.writer.batch-size worth of rows for a bucket it will be sent immediately regardless of this setting, however if we have fewer than this many bytes accumulated for this bucket we will delay for the specified time waiting for more records to show up. |
 | client.writer.bucket.no-key-assigner | Enum | STICKY | The bucket assigner for no key table. For table with bucket key or primary key, we choose a bucket based on a hash of the key. For these table without bucket key and primary key, we can use this option to specify bucket assigner, the candidate assigner is ROUND_ROBIN, STICKY, the default assigner is STICKY.<br/>ROUND_ROBIN: this strategy will assign the bucket id for the input row by round robin.<br/>STICKY: this strategy will assign new bucket id only if the batch changed in record accumulator, otherwise the bucket id will be the same as the front record. |
