Commit cd67eec

[flink] Flink sink supports detecting new buckets for an existing table or partition
1 parent 1f72acd commit cd67eec

3 files changed: +172 −2 lines
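In short, the change lets a running Flink append sink notice that an existing table (or partition) has gained buckets, instead of only ever writing to the buckets that existed when the job started. The sketch below shows the operational flow being exercised here; the helper class and method names are invented for illustration, while the Admin.alterTable call and the bucket.num table change are taken verbatim from the test added in this commit.

import org.apache.fluss.client.admin.Admin;
import org.apache.fluss.metadata.TableChange;
import org.apache.fluss.metadata.TablePath;

import java.util.Collections;

/** Illustrative helper (not part of this commit): scale an existing log table to a new bucket count. */
public final class BucketScaleExample {

    /**
     * Raises bucket.num for the given table. With this commit, Flink sink writers already
     * running against the table pick up the new buckets on their next metadata refresh,
     * rather than writing only to the buckets that existed when the job started.
     */
    public static void scaleBuckets(Admin admin, TablePath tablePath, int newBucketCount)
            throws Exception {
        admin.alterTable(
                        tablePath,
                        Collections.singletonList(
                                TableChange.set("bucket.num", String.valueOf(newBucketCount))),
                        false)
                .get();
    }

    private BucketScaleExample() {}
}

Once bucket.num has been raised, each running writer picks the new buckets up on the first write after its 60-second refresh interval has elapsed (see the FlinkSinkWriter change below).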

fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/sink/writer/AppendSinkWriter.java

Lines changed: 19 additions & 0 deletions
@@ -72,4 +72,23 @@ public void flush(boolean endOfInput) throws IOException {
     TableWriter getTableWriter() {
         return appendWriter;
     }
+
+    @Override
+    protected void updateTable() {
+        appendWriter.flush();
+
+        try {
+            table.close();
+        } catch (Exception e) {
+            LOG.warn("Exception occurs while closing Fluss table before update table.", e);
+        }
+
+        table = connection.getTable(tablePath);
+        appendWriter = table.newAppend().createWriter();
+
+        LOG.info(
+                "Update table {}, current bucket {}.",
+                tablePath,
+                table.getTableInfo().getNumBuckets());
+    }
 }

fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/sink/writer/FlinkSinkWriter.java

Lines changed: 14 additions & 2 deletions
@@ -56,14 +56,16 @@ public abstract class FlinkSinkWriter<InputT> implements SinkWriter<InputT> {
 
     protected static final Logger LOG = LoggerFactory.getLogger(FlinkSinkWriter.class);
 
-    private final TablePath tablePath;
+    private static final int REFRESH_INTERVAL_MS = 60_000;
+
+    protected final TablePath tablePath;
     private final Configuration flussConfig;
     protected final RowType tableRowType;
     protected final @Nullable int[] targetColumnIndexes;
     private final MailboxExecutor mailboxExecutor;
     private final FlussSerializationSchema<InputT> serializationSchema;
 
-    private transient Connection connection;
+    protected transient Connection connection;
     protected transient Table table;
     protected transient FlinkMetricRegistry flinkMetricRegistry;
 
@@ -73,6 +75,8 @@ public abstract class FlinkSinkWriter<InputT> implements SinkWriter<InputT> {
     private transient Counter numRecordsOutErrorsCounter;
    private volatile Throwable asyncWriterException;
 
+    private volatile long lastRefreshTime;
+
     public FlinkSinkWriter(
             TablePath tablePath,
             Configuration flussConfig,
@@ -114,6 +118,7 @@ public void initialize(SinkWriterMetricGroup metricGroup) {
                 table.getTableInfo().getSchema(),
                 tableRowType);
         sanityCheck(table.getTableInfo());
+        lastRefreshTime = System.currentTimeMillis();
 
         try {
             this.serializationSchema.open(
@@ -161,6 +166,11 @@ public void write(InputT inputValue, Context context) throws IOException, InterruptedException {
         } catch (Exception e) {
             throw new IOException(e.getMessage(), e);
         }
+
+        if (System.currentTimeMillis() - lastRefreshTime > REFRESH_INTERVAL_MS) {
+            updateTable();
+            lastRefreshTime = System.currentTimeMillis();
+        }
     }
 
     @Override
@@ -242,4 +252,6 @@ protected void checkAsyncException() throws IOException {
 
     @VisibleForTesting
     abstract TableWriter getTableWriter();
+
+    protected void updateTable() {}
 }
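The base class change above is the whole mechanism: a private refresh interval, a volatile timestamp, a cheap elapsed-time check appended to write(), and a protected no-op updateTable() hook that subclasses such as AppendSinkWriter override. The following standalone sketch restates the same pattern outside of Fluss; the class, method, and field names here are illustrative only, not Fluss classes.

/** Simplified restatement of the periodic-refresh pattern used by FlinkSinkWriter. */
public abstract class PeriodicallyRefreshingWriter<T> {

    private static final long REFRESH_INTERVAL_MS = 60_000;

    // volatile so a refresh triggered on the write path is visible to other readers
    private volatile long lastRefreshTime = System.currentTimeMillis();

    public void write(T record) {
        doWrite(record);

        // Cheap wall-clock check on the hot path: at most one refresh per interval,
        // and only while records keep arriving (an idle writer never refreshes).
        if (System.currentTimeMillis() - lastRefreshTime > REFRESH_INTERVAL_MS) {
            refresh();
            lastRefreshTime = System.currentTimeMillis();
        }
    }

    /** Actual write, e.g. handing the record to an append writer. */
    protected abstract void doWrite(T record);

    /** Refresh hook for subclasses; a no-op by default, mirroring FlinkSinkWriter#updateTable. */
    protected void refresh() {}
}

Because the check lives in write(), a busy writer refreshes at most once per interval and an idle writer never refreshes at all.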

fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/sink/writer/FlinkSinkWriterTest.java

Lines changed: 139 additions & 0 deletions
@@ -20,13 +20,16 @@
 import org.apache.fluss.client.Connection;
 import org.apache.fluss.client.ConnectionFactory;
 import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.client.table.Table;
+import org.apache.fluss.client.table.scanner.log.LogScanner;
 import org.apache.fluss.config.ConfigOptions;
 import org.apache.fluss.config.Configuration;
 import org.apache.fluss.exception.NetworkException;
 import org.apache.fluss.flink.sink.serializer.RowDataSerializationSchema;
 import org.apache.fluss.flink.utils.FlinkTestBase;
 import org.apache.fluss.metadata.DatabaseDescriptor;
 import org.apache.fluss.metadata.Schema;
+import org.apache.fluss.metadata.TableBucket;
 import org.apache.fluss.metadata.TableChange;
 import org.apache.fluss.metadata.TableDescriptor;
 import org.apache.fluss.metadata.TablePath;
@@ -51,7 +54,12 @@
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
 
+import java.lang.reflect.Field;
+import java.time.Duration;
 import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
 import java.util.function.BiConsumer;
 
 import static org.assertj.core.api.Assertions.assertThat;
@@ -265,6 +273,137 @@ private FlinkSinkWriter<RowData> createSinkWriter(
                 serializationSchema);
     }
 
+    @Test
+    void testTableInfoAutoUpdate() throws Exception {
+        String testDb = "test-auto-update-db";
+        TablePath testTablePath = TablePath.of(testDb, "test-auto-update-table");
+
+        // Create database
+        admin.createDatabase(testDb, DatabaseDescriptor.EMPTY, true).get();
+
+        // Create log table with 3 buckets (no primary key)
+        TableDescriptor tableDescriptor =
+                TableDescriptor.builder()
+                        .schema(
+                                Schema.newBuilder()
+                                        .column("id", DataTypes.INT())
+                                        .column("name", DataTypes.STRING())
+                                        .build())
+                        .distributedBy(3)
+                        .build();
+        createTable(testTablePath, tableDescriptor);
+
+        Configuration clientConfig = FLUSS_CLUSTER_EXTENSION.getClientConfig();
+        MockWriterInitContext mockWriterInitContext =
+                new MockWriterInitContext(new InterceptingOperatorMetricGroup());
+
+        // Create AppendSinkWriter
+        RowType tableRowType =
+                RowType.of(
+                        new LogicalType[] {new IntType(), new CharType(10)},
+                        new String[] {"id", "name"});
+        RowDataSerializationSchema serializationSchema =
+                new RowDataSerializationSchema(true, false);
+        AppendSinkWriter<RowData> writer =
+                new AppendSinkWriter<>(
+                        testTablePath,
+                        clientConfig,
+                        tableRowType,
+                        mockWriterInitContext.getMailboxExecutor(),
+                        serializationSchema);
+
+        try {
+            writer.initialize(mockWriterInitContext.metricGroup());
+
+            // Step 1: Write data with 3 buckets, verify success
+            for (int i = 0; i < 10; i++) {
+                writer.write(
+                        GenericRowData.of(i, StringData.fromString("name" + i)),
+                        new MockSinkWriterContext());
+            }
+            writer.flush(false);
+
+            // Verify data is written to 3 buckets
+            Map<Integer, Integer> bucketCounts = countRecordsPerBucket(testTablePath, 3);
+            assertThat(bucketCounts.size()).isEqualTo(3);
+            int totalRecords = bucketCounts.values().stream().mapToInt(Integer::intValue).sum();
+            assertThat(totalRecords).isEqualTo(10);
+
+            // Step 2: Alter table bucket number to 4
+            admin.alterTable(
+                            testTablePath,
+                            Collections.singletonList(TableChange.set("bucket.num", "4")),
+                            false)
+                    .get();
+
+            // Wait for schema sync
+            FLUSS_CLUSTER_EXTENSION.waitAllSchemaSync(testTablePath, 2);
+
+            // Step 3: Force update table by setting lastRefreshTime to trigger refresh
+            Field lastRefreshTimeField = FlinkSinkWriter.class.getDeclaredField("lastRefreshTime");
+            lastRefreshTimeField.setAccessible(true);
+            lastRefreshTimeField.set(
+                    writer, System.currentTimeMillis() - 61000); // Set to 61 seconds ago
+
+            // Step 4: Write more data, should use 4 buckets now
+            for (int i = 10; i < 20; i++) {
+                writer.write(
+                        GenericRowData.of(i, StringData.fromString("name" + i)),
+                        new MockSinkWriterContext());
+            }
+            writer.flush(false);
+
+            // Step 5: Verify data is written to 4 buckets
+            Map<Integer, Integer> newBucketCounts = countRecordsPerBucket(testTablePath, 4);
+            assertThat(newBucketCounts.size()).isEqualTo(4);
+            int newTotalRecords =
+                    newBucketCounts.values().stream().mapToInt(Integer::intValue).sum();
+            assertThat(newTotalRecords).isEqualTo(20); // Total records from both writes
+
+            // Verify that we have records in all 4 buckets
+            Set<Integer> bucketsWithData = newBucketCounts.keySet();
+            assertThat(bucketsWithData).hasSize(4);
+            for (int bucket = 0; bucket < 4; bucket++) {
+                assertThat(bucketsWithData).contains(bucket);
+            }
+        } finally {
+            writer.close();
+        }
+    }
+
+    private Map<Integer, Integer> countRecordsPerBucket(TablePath tablePath, int expectedBuckets)
+            throws Exception {
+        Map<Integer, Integer> bucketCounts = new HashMap<>();
+        Configuration clientConfig = FLUSS_CLUSTER_EXTENSION.getClientConfig();
+        try (Connection connection = ConnectionFactory.createConnection(clientConfig);
+                Table table = connection.getTable(tablePath);
+                LogScanner logScanner = table.newScan().createLogScanner()) {
+            // Subscribe to all buckets from beginning
+            for (int bucket = 0; bucket < expectedBuckets; bucket++) {
+                logScanner.subscribeFromBeginning(bucket);
+            }
+
+            // Collect all records and count by bucket
+            int totalScanned = 0;
+            int maxRecords = 50; // Limit to avoid infinite loop
+            while (totalScanned < maxRecords) {
+                org.apache.fluss.client.table.scanner.log.ScanRecords scanRecords =
                        logScanner.poll(Duration.ofSeconds(1));
+                if (scanRecords.isEmpty()) {
+                    break;
+                }
+                for (TableBucket tableBucket : scanRecords.buckets()) {
+                    int bucketId = tableBucket.getBucket();
+                    int recordCount = scanRecords.records(tableBucket).size();
+                    bucketCounts.put(
+                            bucketId, bucketCounts.getOrDefault(bucketId, 0) + recordCount);
+                    totalScanned += recordCount;
+                }
+            }
+        }
+        return bucketCounts;
+    }
+
     static class MockSinkWriterContext implements SinkWriter.Context {
         @Override
         public long currentWatermark() {