diff --git a/docs/content/docs/connectors/flink-sources/mysql-cdc.md b/docs/content/docs/connectors/flink-sources/mysql-cdc.md index 2d47d9b7d71..8216b38b56a 100644 --- a/docs/content/docs/connectors/flink-sources/mysql-cdc.md +++ b/docs/content/docs/connectors/flink-sources/mysql-cdc.md @@ -468,6 +468,13 @@ Only valid for cdc 1.x version. During a snapshot operation, the connector will For example updating an already updated value in snapshot, or deleting an already deleted entry in snapshot. These replayed change log events should be handled specially. + + scan.snapshot.filter + optional + (none) + String + When reading a table snapshot, the rows of captured tables will be filtered using the specified filter expression (AKA a SQL WHERE clause). By default, no filter is applied, meaning the entire table will be synchronized. e.g. `id > 100`. + @@ -929,6 +936,45 @@ CREATE TABLE products ( `binary_data` field in the database is of type VARBINARY(N). In some scenarios, we need to convert binary data to base64 encoded string data. This feature can be enabled by adding the parameter 'debezium.binary.handling.mode'='base64', By adding this parameter, we can map the binary field type to 'STRING' in Flink SQL, thereby obtaining base64 encoded string data. +### Snapshot Filter + +The `scan.snapshot.filter` option allows you to apply a SQL WHERE clause during snapshot reading, so that only matching rows are synchronized. This is useful for partial initial loads (e.g. migrating only recent data) without modifying the source tables. Binlog events are still captured for all rows after the snapshot phase, regardless of the filter. + +**SQL DDL Example** + +```sql +CREATE TABLE products ( + id INT NOT NULL, + name STRING, + description STRING, + PRIMARY KEY(id) NOT ENFORCED +) WITH ( + 'connector' = 'mysql-cdc', + 'hostname' = 'localhost', + 'port' = '3306', + 'username' = 'root', + 'password' = '123456', + 'database-name' = 'mydb', + 'table-name' = 'products', + 'scan.snapshot.filter' = 'id > 100' +); +``` + +**DataStream API Example** + +```java +MySqlSource mySqlSource = MySqlSource.builder() + .hostname("yourHostname") + .port(yourPort) + .databaseList("mydb") + .tableList("mydb.products") + .username("yourUsername") + .password("yourPassword") + .snapshotFilters("mydb.products", "id > 100") + .deserializer(new JsonDebeziumDeserializationSchema()) + .build(); +``` + ### Available Source metrics Metrics can help understand the progress of assignments, and the following are the supported [Flink metrics](https://nightlies.apache.org/flink/flink-docs-master/docs/ops/metrics/): diff --git a/docs/content/docs/connectors/pipeline-connectors/mysql.md b/docs/content/docs/connectors/pipeline-connectors/mysql.md index e6e61e20720..989f74216a3 100644 --- a/docs/content/docs/connectors/pipeline-connectors/mysql.md +++ b/docs/content/docs/connectors/pipeline-connectors/mysql.md @@ -371,6 +371,16 @@ pipeline: List of readable metadata from SourceRecord to be passed to downstream and could be used in transform module, split by `,`. Available readable metadata are: op_ts. + + scan.snapshot.filters + optional + (none) + List<Map<String, String>> + When reading a table snapshot, the rows of captured tables will be filtered using the specified filter expressions (SQL WHERE clauses).
+ This option is configured as a list of filter specifications, where each specification is represented as a map of string key-value pairs (as shown in the example configuration below).
+ By default, no filter is applied, meaning the entire table will be synchronized. + + @@ -405,6 +415,38 @@ source: # ... ``` +## Snapshot Filters + +The `scan.snapshot.filters` option allows you to apply per-table SQL WHERE clauses during snapshot reading, so that only matching rows are synchronized. This is useful when you want to perform a partial initial load (e.g. migrating only recent data) without modifying the source tables. + +Each entry in the list specifies a `table` pattern (using the same format as the `tables` option) and a `filter` expression. Rows that do not satisfy the expression are skipped during snapshot reading. Binlog events are still captured for all rows after the snapshot phase, regardless of the filter. + +For example, to read only orders placed after 2024-01-01 from `mydb.orders` and only active users from `mydb.users`: + +```yaml +source: + type: mysql + hostname: 127.0.0.1 + port: 3306 + username: admin + password: pass + tables: mydb\..* + server-id: 5401-5404 + scan.snapshot.filters: + - table: mydb.orders + filter: order_date >= '2024-01-01' + - table: mydb.users + filter: status = 'active' + +sink: + type: doris + # ... + +pipeline: + name: Filtered MySQL to Doris Pipeline + parallelism: 4 +``` + ## Available Source metrics Metrics can help understand the progress of assignments, and the following are the supported [Flink metrics](https://nightlies.apache.org/flink/flink-docs-master/docs/ops/metrics/): diff --git a/flink-cdc-cli/src/main/java/org/apache/flink/cdc/cli/parser/YamlPipelineDefinitionParser.java b/flink-cdc-cli/src/main/java/org/apache/flink/cdc/cli/parser/YamlPipelineDefinitionParser.java index 831bb5e6536..c688fb234bd 100644 --- a/flink-cdc-cli/src/main/java/org/apache/flink/cdc/cli/parser/YamlPipelineDefinitionParser.java +++ b/flink-cdc-cli/src/main/java/org/apache/flink/cdc/cli/parser/YamlPipelineDefinitionParser.java @@ -197,18 +197,18 @@ private PipelineDef parse(JsonNode pipelineDefJsonNode, Configuration globalPipe } private SourceDef toSourceDef(JsonNode sourceNode) { - Map sourceMap = - mapper.convertValue(sourceNode, new TypeReference>() {}); + Map sourceMap = + mapper.convertValue(sourceNode, new TypeReference>() {}); // "type" field is required String type = checkNotNull( - sourceMap.remove(TYPE_KEY), + (String) sourceMap.remove(TYPE_KEY), "Missing required field \"%s\" in source configuration", TYPE_KEY); // "name" field is optional - String name = sourceMap.remove(NAME_KEY); + String name = (String) sourceMap.remove(NAME_KEY); return new SourceDef(type, name, Configuration.fromMap(sourceMap)); } diff --git a/flink-cdc-cli/src/test/java/org/apache/flink/cdc/cli/parser/YamlPipelineDefinitionParserTest.java b/flink-cdc-cli/src/test/java/org/apache/flink/cdc/cli/parser/YamlPipelineDefinitionParserTest.java index c8c10db76e7..7472c422521 100644 --- a/flink-cdc-cli/src/test/java/org/apache/flink/cdc/cli/parser/YamlPipelineDefinitionParserTest.java +++ b/flink-cdc-cli/src/test/java/org/apache/flink/cdc/cli/parser/YamlPipelineDefinitionParserTest.java @@ -29,6 +29,7 @@ import org.apache.flink.cdc.composer.definition.UdfDef; import org.apache.flink.core.fs.Path; +import org.apache.flink.shaded.guava31.com.google.common.collect.ImmutableList; import org.apache.flink.shaded.guava31.com.google.common.collect.ImmutableMap; import org.apache.flink.shaded.guava31.com.google.common.collect.ImmutableSet; import org.apache.flink.shaded.guava31.com.google.common.io.Resources; @@ -370,9 +371,9 @@ private void testSchemaEvolutionTypesParsing( "mysql", "source-database", Configuration.fromMap( - ImmutableMap.builder() + ImmutableMap.builder() .put("host", "localhost") - .put("port", "3306") + .put("port", 3306) .put("username", "admin") .put("password", "pass") .put( @@ -381,7 +382,24 @@ private void testSchemaEvolutionTypesParsing( .put( "chunk-column", "app_order_.*:id,web_order:product_id") - .put("capture-new-tables", "true") + .put("capture-new-tables", true) + .put( + "scan.snapshot.filters", + ImmutableList.of( + ImmutableMap.builder() + .put( + "table", + "db1.user_table_[0-9]+") + .put("filter", "id > 100") + .build(), + ImmutableMap.builder() + .put( + "table", + "db[1-2].[app|web]_order_\\.*") + .put( + "filter", + "city != 'China:beijing'") + .build())) .build())), new SinkDef( "kafka", @@ -508,9 +526,9 @@ void testParsingFullDefinitionFromString() throws Exception { "mysql", "source-database", Configuration.fromMap( - ImmutableMap.builder() + ImmutableMap.builder() .put("host", "localhost") - .put("port", "3306") + .put("port", 3306) .put("username", "admin") .put("password", "pass") .put( @@ -519,7 +537,24 @@ void testParsingFullDefinitionFromString() throws Exception { .put( "chunk-column", "app_order_.*:id,web_order:product_id") - .put("capture-new-tables", "true") + .put("capture-new-tables", true) + .put( + "scan.snapshot.filters", + ImmutableList.of( + ImmutableMap.builder() + .put( + "table", + "db1.user_table_[0-9]+") + .put("filter", "id > 100") + .build(), + ImmutableMap.builder() + .put( + "table", + "db[1-2].[app|web]_order_\\.*") + .put( + "filter", + "city != 'China:beijing'") + .build())) .build())), new SinkDef( "kafka", @@ -587,9 +622,9 @@ void testParsingFullDefinitionFromString() throws Exception { "mysql", null, Configuration.fromMap( - ImmutableMap.builder() + ImmutableMap.builder() .put("host", "localhost") - .put("port", "3306") + .put("port", 3306) .put("username", "admin") .put("password", "pass") .put( @@ -650,9 +685,9 @@ void testParsingFullDefinitionFromString() throws Exception { "mysql", "source-database", Configuration.fromMap( - ImmutableMap.builder() + ImmutableMap.builder() .put("host", "localhost") - .put("port", "3306") + .put("port", 3306) .put("username", "admin") .put("password", "pass") .put( @@ -661,7 +696,7 @@ void testParsingFullDefinitionFromString() throws Exception { .put( "chunk-column", "app_order_.*:id,web_order:product_id") - .put("capture-new-tables", "true") + .put("capture-new-tables", true) .build())), new SinkDef( "kafka", diff --git a/flink-cdc-cli/src/test/resources/definitions/pipeline-definition-full.yaml b/flink-cdc-cli/src/test/resources/definitions/pipeline-definition-full.yaml index dec2c25dc23..3be2d51321c 100644 --- a/flink-cdc-cli/src/test/resources/definitions/pipeline-definition-full.yaml +++ b/flink-cdc-cli/src/test/resources/definitions/pipeline-definition-full.yaml @@ -24,6 +24,11 @@ source: tables: adb.*, bdb.user_table_[0-9]+, [app|web]_order_.* chunk-column: app_order_.*:id,web_order:product_id capture-new-tables: true + scan.snapshot.filters: + - table: db1.user_table_[0-9]+ + filter: id > 100 + - table: db[1-2].[app|web]_order_\.* + filter: city != 'China:beijing' sink: type: kafka diff --git a/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/configuration/Configuration.java b/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/configuration/Configuration.java index c1928191e1a..eddb7a52f89 100644 --- a/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/configuration/Configuration.java +++ b/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/configuration/Configuration.java @@ -61,7 +61,7 @@ public Configuration(Configuration other) { } /** Creates a new configuration that is initialized with the options of the given map. */ - public static Configuration fromMap(Map map) { + public static Configuration fromMap(Map map) { final Configuration configuration = new Configuration(); configuration.confData.putAll(map); return configuration; diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/main/java/org/apache/flink/cdc/connectors/mysql/factory/MySqlDataSourceFactory.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/main/java/org/apache/flink/cdc/connectors/mysql/factory/MySqlDataSourceFactory.java index 1b3540da0bb..eae4e214ea2 100644 --- a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/main/java/org/apache/flink/cdc/connectors/mysql/factory/MySqlDataSourceFactory.java +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/main/java/org/apache/flink/cdc/connectors/mysql/factory/MySqlDataSourceFactory.java @@ -55,6 +55,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -83,6 +84,7 @@ import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST_ENABLED; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_NEWLY_ADDED_TABLE_ENABLED; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_SNAPSHOT_FETCH_SIZE; +import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_SNAPSHOT_FILTERS; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_STARTUP_MODE; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_STARTUP_SPECIFIC_OFFSET_FILE; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_STARTUP_SPECIFIC_OFFSET_GTID_SET; @@ -112,6 +114,8 @@ public class MySqlDataSourceFactory implements DataSourceFactory { private static final Logger LOG = LoggerFactory.getLogger(MySqlDataSourceFactory.class); public static final String IDENTIFIER = "mysql"; + public static final String SNAPSHOT_FILTER_TABLE_KEY = "table"; + public static final String SNAPSHOT_FILTER_FILTER_KEY = "filter"; @Override public DataSource createDataSource(Context context) { @@ -284,11 +288,78 @@ public DataSource createDataSource(Context context) { LOG.info("Add chunkKeyColumn {}.", chunkKeyColumnMap); configFactory.chunkKeyColumn(chunkKeyColumnMap); } + + List> snapshotFilters = config.get(SCAN_SNAPSHOT_FILTERS); + if (snapshotFilters != null && !snapshotFilters.isEmpty()) { + Map snapshotFiltersMap = + parseAndValidateSnapshotFilters(snapshotFilters); + LOG.info("Add snapshotFilters {}.", snapshotFiltersMap); + configFactory.snapshotFilters(snapshotFiltersMap); + } + String metadataList = config.get(METADATA_LIST); List readableMetadataList = listReadableMetadata(metadataList); return new MySqlDataSource(configFactory, readableMetadataList); } + /** + * Parses and validates snapshot filters configuration. + * + * @param snapshotFilters List of filter entries, each containing 'table' and 'filter' keys + * @return LinkedHashMap preserving insertion order, mapping table patterns to filter + * expressions + * @throws ValidationException If any entry is missing required keys or contains duplicate table + * patterns + */ + private Map parseAndValidateSnapshotFilters( + List> snapshotFilters) { + Map result = new LinkedHashMap<>(); + + for (int i = 0; i < snapshotFilters.size(); i++) { + Map entry = snapshotFilters.get(i); + + // Validate required keys + String table = entry.get(SNAPSHOT_FILTER_TABLE_KEY); + String filter = entry.get(SNAPSHOT_FILTER_FILTER_KEY); + + if (table == null || table.trim().isEmpty()) { + throw new ValidationException( + String.format( + "Snapshot filter entry at index %d is missing required key '%s'. " + + "Each entry must contain both '%s' and '%s' keys.", + i, + SNAPSHOT_FILTER_TABLE_KEY, + SNAPSHOT_FILTER_TABLE_KEY, + SNAPSHOT_FILTER_FILTER_KEY)); + } + + if (filter == null || filter.trim().isEmpty()) { + throw new ValidationException( + String.format( + "Snapshot filter entry at index %d is missing required key '%s'. " + + "Each entry must contain both '%s' and '%s' keys.", + i, + SNAPSHOT_FILTER_FILTER_KEY, + SNAPSHOT_FILTER_TABLE_KEY, + SNAPSHOT_FILTER_FILTER_KEY)); + } + + // Check for duplicates + if (result.containsKey(table)) { + throw new ValidationException( + String.format( + "Duplicate table pattern '%s' found in snapshot filters at index %d. " + + "Each table pattern can only appear once. " + + "Previous definition: '%s', Current definition: '%s'.", + table, i, result.get(table), filter)); + } + + result.put(table, filter); + } + + return result; + } + private List listReadableMetadata(String metadataList) { if (StringUtils.isNullOrWhitespaceOnly(metadataList)) { return new ArrayList<>(); @@ -358,6 +429,7 @@ public Set> optionalOptions() { options.add(PARSE_ONLINE_SCHEMA_CHANGES); options.add(SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST_ENABLED); options.add(SCAN_INCREMENTAL_SNAPSHOT_BACKFILL_SKIP); + options.add(SCAN_SNAPSHOT_FILTERS); return options; } diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/main/java/org/apache/flink/cdc/connectors/mysql/source/MySqlDataSourceOptions.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/main/java/org/apache/flink/cdc/connectors/mysql/source/MySqlDataSourceOptions.java index 6aff556e7fa..d116bc1a84d 100644 --- a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/main/java/org/apache/flink/cdc/connectors/mysql/source/MySqlDataSourceOptions.java +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/main/java/org/apache/flink/cdc/connectors/mysql/source/MySqlDataSourceOptions.java @@ -23,6 +23,8 @@ import org.apache.flink.cdc.common.configuration.ConfigOptions; import java.time.Duration; +import java.util.List; +import java.util.Map; /** Configurations for {@link MySqlDataSource}. */ @PublicEvolving @@ -330,4 +332,21 @@ public class MySqlDataSourceOptions { .defaultValue(false) .withDescription( "Whether to skip backfill in snapshot reading phase. If backfill is skipped, changes on captured tables during snapshot phase will be consumed later in change log reading phase instead of being merged into the snapshot.WARNING: Skipping backfill might lead to data inconsistency because some change log events happened within the snapshot phase might be replayed (only at-least-once semantic is promised). For example updating an already updated value in snapshot, or deleting an already deleted entry in snapshot. These replayed change log events should be handled specially."); + + @Experimental + public static final ConfigOption>> SCAN_SNAPSHOT_FILTERS = + ConfigOptions.key("scan.snapshot.filters") + .mapType() + .asList() + .noDefaultValue() + .withDescription( + "When reading a table snapshot, the rows of captured tables will be filtered using the specified filter expressions (AKA SQL WHERE clauses). " + + "By default, no filter is applied, meaning the entire table will be synchronized. " + + "This option expects a list of maps, where each map specifies a table pattern and its filter expression. " + + "The configuration format in YAML looks like:\n" + + "scan.snapshot.filters:\n" + + " - table: db1.user_table_[0-9]+\n" + + " filter: id > 100\n" + + " - table: db[1-2].[app|web]_order_\\.*\n" + + " filter: city != 'China:beijing'\n"); } diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/test/java/org/apache/flink/cdc/connectors/mysql/source/MySqlDataSourceFactoryTest.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/test/java/org/apache/flink/cdc/connectors/mysql/source/MySqlDataSourceFactoryTest.java index 75fb6590d34..76f3075a6b7 100644 --- a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/test/java/org/apache/flink/cdc/connectors/mysql/source/MySqlDataSourceFactoryTest.java +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-mysql/src/test/java/org/apache/flink/cdc/connectors/mysql/source/MySqlDataSourceFactoryTest.java @@ -30,6 +30,7 @@ import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -43,6 +44,7 @@ import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_BINLOG_NEWLY_ADDED_TABLE_ENABLED; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_INCREMENTAL_SNAPSHOT_CHUNK_KEY_COLUMN; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST_ENABLED; +import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.SCAN_SNAPSHOT_FILTERS; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.TABLES; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.TABLES_EXCLUDE; import static org.apache.flink.cdc.connectors.mysql.source.MySqlDataSourceOptions.TREAT_TINYINT1_AS_BOOLEAN_ENABLED; @@ -331,6 +333,142 @@ void testAddChunkKeyColumns() { }); } + @Test + void testAddSnapshotFilters() { + inventoryDatabase.createAndInitialize(); + Map options = new HashMap<>(); + options.put(HOSTNAME.key(), MYSQL_CONTAINER.getHost()); + options.put(PORT.key(), String.valueOf(MYSQL_CONTAINER.getDatabasePort())); + options.put(USERNAME.key(), TEST_USER); + options.put(PASSWORD.key(), TEST_PASSWORD); + options.put(TABLES.key(), inventoryDatabase.getDatabaseName() + ".\\.*"); + options.put(SCAN_SNAPSHOT_FILTERS.key(), mockSnapshotFilters()); + Factory.Context context = new MockContext(Configuration.fromMap(options)); + + MySqlDataSourceFactory factory = new MySqlDataSourceFactory(); + MySqlDataSource dataSource = (MySqlDataSource) factory.createDataSource(context); + + assertThat(dataSource.getSourceConfig().getSnapshotFilters()) + .isNotEmpty() + .isEqualTo( + new HashMap() { + { + put( + inventoryDatabase.getDatabaseName() + ".multi_max_\\.*", + "id > 200"); + put(inventoryDatabase.getDatabaseName() + ".products", "1 = 0"); + put( + inventoryDatabase.getDatabaseName() + ".customers", + "city != 'China:beijing'"); + } + }); + } + + @Test + void testSnapshotFiltersMissingTableKey() { + inventoryDatabase.createAndInitialize(); + Map options = new HashMap<>(); + options.put(HOSTNAME.key(), MYSQL_CONTAINER.getHost()); + options.put(PORT.key(), String.valueOf(MYSQL_CONTAINER.getDatabasePort())); + options.put(USERNAME.key(), TEST_USER); + options.put(PASSWORD.key(), TEST_PASSWORD); + options.put(TABLES.key(), inventoryDatabase.getDatabaseName() + ".\\.*"); + + // Create snapshot filters with missing 'table' key + List> snapshotFilters = new ArrayList<>(); + Map filter1 = new HashMap<>(); + filter1.put("filter", "id > 100"); + // Missing 'table' key + snapshotFilters.add(filter1); + options.put(SCAN_SNAPSHOT_FILTERS.key(), snapshotFilters); + + Factory.Context context = new MockContext(Configuration.fromMap(options)); + MySqlDataSourceFactory factory = new MySqlDataSourceFactory(); + + assertThatThrownBy(() -> factory.createDataSource(context)) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("missing required key 'table'"); + } + + @Test + void testSnapshotFiltersMissingFilterKey() { + inventoryDatabase.createAndInitialize(); + Map options = new HashMap<>(); + options.put(HOSTNAME.key(), MYSQL_CONTAINER.getHost()); + options.put(PORT.key(), String.valueOf(MYSQL_CONTAINER.getDatabasePort())); + options.put(USERNAME.key(), TEST_USER); + options.put(PASSWORD.key(), TEST_PASSWORD); + options.put(TABLES.key(), inventoryDatabase.getDatabaseName() + ".\\.*"); + + // Create snapshot filters with missing 'filter' key + List> snapshotFilters = new ArrayList<>(); + Map filter1 = new HashMap<>(); + filter1.put("table", inventoryDatabase.getDatabaseName() + ".products"); + // Missing 'filter' key + snapshotFilters.add(filter1); + options.put(SCAN_SNAPSHOT_FILTERS.key(), snapshotFilters); + + Factory.Context context = new MockContext(Configuration.fromMap(options)); + MySqlDataSourceFactory factory = new MySqlDataSourceFactory(); + + assertThatThrownBy(() -> factory.createDataSource(context)) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("missing required key 'filter'"); + } + + @Test + void testSnapshotFiltersDuplicateTablePattern() { + inventoryDatabase.createAndInitialize(); + Map options = new HashMap<>(); + options.put(HOSTNAME.key(), MYSQL_CONTAINER.getHost()); + options.put(PORT.key(), String.valueOf(MYSQL_CONTAINER.getDatabasePort())); + options.put(USERNAME.key(), TEST_USER); + options.put(PASSWORD.key(), TEST_PASSWORD); + options.put(TABLES.key(), inventoryDatabase.getDatabaseName() + ".\\.*"); + + // Create snapshot filters with duplicate table pattern + List> snapshotFilters = new ArrayList<>(); + Map filter1 = new HashMap<>(); + filter1.put("table", inventoryDatabase.getDatabaseName() + ".products"); + filter1.put("filter", "id > 100"); + snapshotFilters.add(filter1); + + Map filter2 = new HashMap<>(); + filter2.put("table", inventoryDatabase.getDatabaseName() + ".products"); + filter2.put("filter", "id > 200"); + snapshotFilters.add(filter2); + + options.put(SCAN_SNAPSHOT_FILTERS.key(), snapshotFilters); + + Factory.Context context = new MockContext(Configuration.fromMap(options)); + MySqlDataSourceFactory factory = new MySqlDataSourceFactory(); + + assertThatThrownBy(() -> factory.createDataSource(context)) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Duplicate table pattern"); + } + + private List> mockSnapshotFilters() { + List> snapshotFilters = new ArrayList<>(); + + Map filter1 = new HashMap<>(); + filter1.put("table", inventoryDatabase.getDatabaseName() + ".multi_max_\\.*"); + filter1.put("filter", "id > 200"); + snapshotFilters.add(filter1); + + Map filter2 = new HashMap<>(); + filter2.put("table", inventoryDatabase.getDatabaseName() + ".products"); + filter2.put("filter", "1 = 0"); + snapshotFilters.add(filter2); + + Map filter3 = new HashMap<>(); + filter3.put("table", inventoryDatabase.getDatabaseName() + ".customers"); + filter3.put("filter", "city != 'China:beijing'"); + snapshotFilters.add(filter3); + + return snapshotFilters; + } + class MockContext implements Factory.Context { Configuration factoryConfiguration; diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/debezium/task/MySqlSnapshotSplitReadTask.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/debezium/task/MySqlSnapshotSplitReadTask.java index f729f59d05f..df4413482c5 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/debezium/task/MySqlSnapshotSplitReadTask.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/debezium/task/MySqlSnapshotSplitReadTask.java @@ -24,6 +24,7 @@ import org.apache.flink.cdc.connectors.mysql.source.config.MySqlSourceConfig; import org.apache.flink.cdc.connectors.mysql.source.offset.BinlogOffset; import org.apache.flink.cdc.connectors.mysql.source.split.MySqlSnapshotSplit; +import org.apache.flink.cdc.connectors.mysql.source.utils.SnapshotFilterUtils; import org.apache.flink.cdc.connectors.mysql.source.utils.StatementUtils; import org.apache.flink.cdc.connectors.mysql.source.utils.hooks.SnapshotPhaseHooks; @@ -83,6 +84,7 @@ public class MySqlSnapshotSplitReadTask private final SnapshotPhaseHooks hooks; private final boolean isBackfillSkipped; + private final String snapshotFilter; public MySqlSnapshotSplitReadTask( MySqlSourceConfig sourceConfig, @@ -109,6 +111,9 @@ public MySqlSnapshotSplitReadTask( this.snapshotChangeEventSourceMetrics = snapshotChangeEventSourceMetrics; this.hooks = hooks; this.isBackfillSkipped = isBackfillSkipped; + this.snapshotFilter = + SnapshotFilterUtils.getSnapshotFilter( + sourceConfig.getSnapshotFilters(), snapshotSplit.getTableId()); } @Override @@ -242,12 +247,21 @@ private void createDataEventsForTable( long exportStart = clock.currentTimeInMillis(); LOG.info("Exporting data from split '{}' of table {}", snapshotSplit.splitId(), table.id()); + if (snapshotFilter != null) { + LOG.info( + "Filter for split '{}' of table {} is: {}", + snapshotSplit.splitId(), + table.id(), + snapshotFilter); + } + final String selectSql = StatementUtils.buildSplitScanQuery( snapshotSplit.getTableId(), snapshotSplit.getSplitKeyType(), snapshotSplit.getSplitStart() == null, - snapshotSplit.getSplitEnd() == null); + snapshotSplit.getSplitEnd() == null, + snapshotFilter); LOG.info( "For split '{}' of table {} using select statement: '{}'", snapshotSplit.splitId(), diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/MySqlSourceBuilder.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/MySqlSourceBuilder.java index 93fa2a0d36a..e1ffd308432 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/MySqlSourceBuilder.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/MySqlSourceBuilder.java @@ -312,6 +312,15 @@ public MySqlSourceBuilder assignUnboundedChunkFirst(boolean assignUnboundedCh return this; } + /** + * When reading a table snapshot, the rows of captured tables will be filtered using the + * specified filter expression (AKA a SQL WHERE clause). + */ + public MySqlSourceBuilder snapshotFilters(String table, String filter) { + this.configFactory.snapshotFilters(table, filter); + return this; + } + /** * Build the {@link MySqlSource}. * diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/assigners/MySqlChunkSplitter.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/assigners/MySqlChunkSplitter.java index b7efb9ab8ec..14243fbeb19 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/assigners/MySqlChunkSplitter.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/assigners/MySqlChunkSplitter.java @@ -26,6 +26,7 @@ import org.apache.flink.cdc.connectors.mysql.source.split.MySqlSnapshotSplit; import org.apache.flink.cdc.connectors.mysql.source.utils.ChunkUtils; import org.apache.flink.cdc.connectors.mysql.source.utils.ObjectUtils; +import org.apache.flink.cdc.connectors.mysql.source.utils.SnapshotFilterUtils; import org.apache.flink.cdc.connectors.mysql.source.utils.StatementUtils; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalTypeRoot; @@ -151,9 +152,13 @@ private void analyzeTable(MySqlPartition partition, TableId tableId) { splitType = ChunkUtils.getChunkKeyColumnType( splitColumn, sourceConfig.isTreatTinyInt1AsBoolean()); + String filter = + SnapshotFilterUtils.getSnapshotFilter( + sourceConfig.getSnapshotFilters(), tableId); minMaxOfSplitColumn = - StatementUtils.queryMinMax(jdbcConnection, tableId, splitColumn.name()); - approximateRowCnt = StatementUtils.queryApproximateRowCnt(jdbcConnection, tableId); + StatementUtils.queryMinMax(jdbcConnection, tableId, splitColumn.name(), filter); + approximateRowCnt = + StatementUtils.queryRowCnt(jdbcConnection, tableId, splitColumn.name(), filter); } catch (Exception e) { throw new RuntimeException("Fail to analyze table in chunk splitter.", e); } @@ -171,6 +176,8 @@ private MySqlSnapshotSplit splitOneUnevenlySizedChunk(MySqlPartition partition, nextChunkStart == ChunkSplitterState.ChunkBound.START_BOUND ? "null" : chunkStartVal.toString()); + String filter = + SnapshotFilterUtils.getSnapshotFilter(sourceConfig.getSnapshotFilters(), tableId); // we start from [null, min + chunk_size) and avoid [null, min) Object chunkEnd = nextChunkEnd( @@ -181,7 +188,8 @@ private MySqlSnapshotSplit splitOneUnevenlySizedChunk(MySqlPartition partition, tableId, splitColumn.name(), minMaxOfSplitColumn[1], - chunkSize); + chunkSize, + filter); // may sleep a while to avoid DDOS on MySQL server maySleep(nextChunkId, tableId); if (chunkEnd != null && ObjectUtils.compare(chunkEnd, minMaxOfSplitColumn[1]) <= 0) { @@ -329,19 +337,20 @@ Object nextChunkEnd( TableId tableId, String splitColumnName, Object max, - int chunkSize) + int chunkSize, + @Nullable String filter) throws SQLException { // chunk end might be null when max values are removed Object chunkEnd = StatementUtils.queryNextChunkMax( - jdbc, tableId, splitColumnName, chunkSize, previousChunkEnd); + jdbc, tableId, splitColumnName, chunkSize, previousChunkEnd, filter); if (chunkEnd == null) { return null; } if (Objects.equals(previousChunkEnd, chunkEnd)) { // we don't allow equal chunk start and end, // should query the next one larger than chunkEnd - chunkEnd = StatementUtils.queryMin(jdbc, tableId, splitColumnName, chunkEnd); + chunkEnd = StatementUtils.queryMin(jdbc, tableId, splitColumnName, chunkEnd, filter); // queryMin will return null when the chunkEnd is the max value, // this will happen when the mysql table ignores the capitalization. diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceConfig.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceConfig.java index cf456fcaed0..d7a077e85dc 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceConfig.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceConfig.java @@ -72,6 +72,7 @@ public class MySqlSourceConfig implements Serializable { private final boolean parseOnLineSchemaChanges; public static boolean useLegacyJsonFormat = true; private final boolean assignUnboundedChunkFirst; + private final Map snapshotFilters; // -------------------------------------------------------------------------------------------- // Debezium Configurations @@ -112,7 +113,8 @@ public class MySqlSourceConfig implements Serializable { boolean parseOnLineSchemaChanges, boolean treatTinyInt1AsBoolean, boolean useLegacyJsonFormat, - boolean assignUnboundedChunkFirst) { + boolean assignUnboundedChunkFirst, + Map snapshotFilters) { this.hostname = checkNotNull(hostname); this.port = port; this.username = checkNotNull(username); @@ -158,6 +160,7 @@ public class MySqlSourceConfig implements Serializable { this.treatTinyInt1AsBoolean = treatTinyInt1AsBoolean; this.useLegacyJsonFormat = useLegacyJsonFormat; this.assignUnboundedChunkFirst = assignUnboundedChunkFirst; + this.snapshotFilters = snapshotFilters; } public String getHostname() { @@ -299,4 +302,8 @@ public boolean isSkipSnapshotBackfill() { public boolean isTreatTinyInt1AsBoolean() { return treatTinyInt1AsBoolean; } + + public Map getSnapshotFilters() { + return snapshotFilters; + } } diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceConfigFactory.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceConfigFactory.java index 569b62232db..49aea5b1aad 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceConfigFactory.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceConfigFactory.java @@ -78,6 +78,7 @@ public class MySqlSourceConfigFactory implements Serializable { private boolean treatTinyInt1AsBoolean = true; private boolean useLegacyJsonFormat = true; private boolean assignUnboundedChunkFirst = false; + private Map snapshotFilters = new HashMap<>(); public MySqlSourceConfigFactory hostname(String hostname) { this.hostname = hostname; @@ -341,6 +342,24 @@ public MySqlSourceConfigFactory assignUnboundedChunkFirst(boolean assignUnbounde return this; } + /** + * When reading a table snapshot, the rows of captured tables will be filtered using the + * specified filter expression (AKA a SQL WHERE clause). + */ + public MySqlSourceConfigFactory snapshotFilters(String table, String filter) { + this.snapshotFilters.put(table, filter); + return this; + } + + /** + * When reading a table snapshot, the rows of captured tables will be filtered using the + * specified filter expression (AKA a SQL WHERE clause). + */ + public MySqlSourceConfigFactory snapshotFilters(Map snapshotFilters) { + this.snapshotFilters.putAll(snapshotFilters); + return this; + } + /** Creates a new {@link MySqlSourceConfig} for the given subtask {@code subtaskId}. */ public MySqlSourceConfig createConfig(int subtaskId) { // hard code server name, because we don't need to distinguish it, docs: @@ -444,6 +463,7 @@ public MySqlSourceConfig createConfig(int subtaskId, String serverName) { parseOnLineSchemaChanges, treatTinyInt1AsBoolean, useLegacyJsonFormat, - assignUnboundedChunkFirst); + assignUnboundedChunkFirst, + snapshotFilters); } } diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceOptions.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceOptions.java index a8e143f5fc5..a710d468bc0 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceOptions.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/config/MySqlSourceOptions.java @@ -292,4 +292,13 @@ public class MySqlSourceOptions { .defaultValue(true) .withDescription( "Whether to assign the unbounded chunks first during snapshot reading phase. This might help reduce the risk of the TaskManager experiencing an out-of-memory (OOM) error when taking a snapshot of the largest unbounded chunk."); + + @Experimental + public static final ConfigOption SCAN_SNAPSHOT_FILTER = + ConfigOptions.key("scan.snapshot.filter") + .stringType() + .noDefaultValue() + .withDescription( + "When reading a table snapshot, the rows of captured tables will be filtered using the specified filter expression (AKA a SQL WHERE clause). " + + "By default, no filter is applied, meaning the entire table will be synchronized. e.g. `id > 100`"); } diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/utils/SnapshotFilterUtils.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/utils/SnapshotFilterUtils.java new file mode 100644 index 00000000000..69e6b322dae --- /dev/null +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/utils/SnapshotFilterUtils.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.mysql.source.utils; + +import org.apache.flink.cdc.common.schema.Selectors; + +import io.debezium.relational.TableId; + +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** Utilities to filter snapshot of table. */ +public class SnapshotFilterUtils { + + private SnapshotFilterUtils() {} + + private static final Map, Map> cache = + new ConcurrentHashMap<>(); + + /** + * Converts the given filters to a map keyed by {@link Selectors}, caching the result. + * + *

The cache is backed by a {@link ConcurrentHashMap} to be safe under concurrent access. To + * avoid using a mutable {@link Map} as the cache key, an immutable copy of the input filters is + * created and used as the key. + * + *

Uses {@link LinkedHashMap} to preserve insertion order, ensuring deterministic matching + * when multiple patterns could match the same table. + */ + private static Map toSelector(Map filters) { + // Create an immutable copy of the filters to avoid using a mutable map as the cache key. + // Use LinkedHashMap to preserve the user-defined order. + Map immutableFilters = + Collections.unmodifiableMap(new LinkedHashMap<>(filters)); + + return cache.computeIfAbsent( + immutableFilters, + key -> { + // Use LinkedHashMap to preserve insertion order for deterministic matching + Map snapshotFilters = new LinkedHashMap<>(); + key.forEach( + (table, filter) -> { + Selectors selector = + new Selectors.SelectorsBuilder() + .includeTables(table) + .build(); + snapshotFilters.put(selector, filter); + }); + return snapshotFilters; + }); + } + + public static String getSnapshotFilter(Map filters, TableId tableId) { + Map snapshotFilters = toSelector(filters); + + String filter = null; + for (Selectors selector : snapshotFilters.keySet()) { + if (selector.isMatch( + org.apache.flink.cdc.common.event.TableId.tableId( + tableId.catalog(), tableId.table()))) { + filter = snapshotFilters.get(selector); + break; + } + } + return filter; + } +} diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/utils/StatementUtils.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/utils/StatementUtils.java index c910cbfdc37..ad5fd1ce439 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/utils/StatementUtils.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/source/utils/StatementUtils.java @@ -22,6 +22,8 @@ import io.debezium.jdbc.JdbcConnection; import io.debezium.relational.TableId; +import javax.annotation.Nullable; + import java.math.BigDecimal; import java.math.BigInteger; import java.sql.Connection; @@ -38,12 +40,15 @@ public class StatementUtils { private StatementUtils() {} - public static Object[] queryMinMax(JdbcConnection jdbc, TableId tableId, String columnName) + public static Object[] queryMinMax( + JdbcConnection jdbc, TableId tableId, String columnName, @Nullable String filter) throws SQLException { final String minMaxQuery = String.format( "SELECT MIN(%s), MAX(%s) FROM %s", - quote(columnName), quote(columnName), quote(tableId)); + quote(columnName), + quote(columnName), + filter != null ? quote(tableId) + " WHERE " + filter : quote(tableId)); return jdbc.queryAndMap( minMaxQuery, rs -> { @@ -58,6 +63,29 @@ public static Object[] queryMinMax(JdbcConnection jdbc, TableId tableId, String }); } + public static Long queryRowCnt( + JdbcConnection jdbc, TableId tableId, String columnName, @Nullable String filter) + throws SQLException { + + if (filter == null) { + return queryApproximateRowCnt(jdbc, tableId); + } + + final String cntQuery = + String.format("SELECT COUNT(1) FROM %s WHERE (%s)", quote(tableId), filter); + return jdbc.queryAndMap( + cntQuery, + rs -> { + if (!rs.next()) { + // this should never happen + throw new SQLException( + String.format( + "No result returned after running query [%s]", cntQuery)); + } + return rs.getLong(1); + }); + } + public static long queryApproximateRowCnt(JdbcConnection jdbc, TableId tableId) throws SQLException { // The statement used to get approximate row count which is less @@ -97,12 +125,20 @@ public static void setSafeObject(PreparedStatement ps, int parameterIndex, Objec } public static Object queryMin( - JdbcConnection jdbc, TableId tableId, String columnName, Object excludedLowerBound) + JdbcConnection jdbc, + TableId tableId, + String columnName, + Object excludedLowerBound, + @Nullable String filter) throws SQLException { final String minQuery = String.format( "SELECT MIN(%s) FROM %s WHERE %s > ?", - quote(columnName), quote(tableId), quote(columnName)); + quote(columnName), + quote(tableId), + filter != null + ? "(" + filter + ") AND " + quote(columnName) + : quote(columnName)); return jdbc.prepareQueryAndMap( minQuery, ps -> setSafeObject(ps, 1, excludedLowerBound), @@ -122,7 +158,8 @@ public static Object queryNextChunkMax( TableId tableId, String splitColumnName, int chunkSize, - Object includedLowerBound) + Object includedLowerBound, + @Nullable String filter) throws SQLException { String quotedColumn = quote(splitColumnName); String query = @@ -133,7 +170,7 @@ public static Object queryNextChunkMax( quotedColumn, quotedColumn, quote(tableId), - quotedColumn, + filter != null ? "(" + filter + ") AND " + quotedColumn : quotedColumn, quotedColumn, chunkSize); return jdbc.prepareQueryAndMap( @@ -151,8 +188,12 @@ public static Object queryNextChunkMax( } public static String buildSplitScanQuery( - TableId tableId, RowType pkRowType, boolean isFirstSplit, boolean isLastSplit) { - return buildSplitQuery(tableId, pkRowType, isFirstSplit, isLastSplit, -1, true); + TableId tableId, + RowType pkRowType, + boolean isFirstSplit, + boolean isLastSplit, + @Nullable String filter) { + return buildSplitQuery(tableId, pkRowType, isFirstSplit, isLastSplit, -1, true, filter); } private static String buildSplitQuery( @@ -161,8 +202,9 @@ private static String buildSplitQuery( boolean isFirstSplit, boolean isLastSplit, int limitSize, - boolean isScanningData) { - final String condition; + boolean isScanningData, + @Nullable String filter) { + String condition; if (isFirstSplit && isLastSplit) { condition = null; @@ -192,6 +234,10 @@ private static String buildSplitQuery( condition = sql.toString(); } + if (filter != null) { + condition = condition == null ? filter : "(" + filter + ") AND " + condition; + } + if (isScanningData) { return buildSelectWithRowLimits( tableId, limitSize, "*", Optional.ofNullable(condition), Optional.empty()); diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSource.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSource.java index 2a1f0519435..6278c4dcbb7 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSource.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSource.java @@ -103,6 +103,7 @@ public class MySqlTableSource implements ScanTableSource, SupportsReadingMetadat private final boolean assignUnboundedChunkFirst; private final boolean appendOnly; + private final String snapshotFilter; // -------------------------------------------------------------------------------------------- // Mutable attributes @@ -144,7 +145,8 @@ public MySqlTableSource( boolean parseOnlineSchemaChanges, boolean useLegacyJsonFormat, boolean assignUnboundedChunkFirst, - boolean appendOnly) { + boolean appendOnly, + @Nullable String snapshotFilter) { this.physicalSchema = physicalSchema; this.port = port; this.hostname = checkNotNull(hostname); @@ -178,6 +180,7 @@ public MySqlTableSource( this.useLegacyJsonFormat = useLegacyJsonFormat; this.assignUnboundedChunkFirst = assignUnboundedChunkFirst; this.appendOnly = appendOnly; + this.snapshotFilter = snapshotFilter; } @Override @@ -241,6 +244,9 @@ public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) { .parseOnLineSchemaChanges(parseOnlineSchemaChanges) .useLegacyJsonFormat(useLegacyJsonFormat) .assignUnboundedChunkFirst(assignUnboundedChunkFirst) + .snapshotFilters( + escapeDot(database) + "." + escapeDot(tableName), + snapshotFilter) .build(); return SourceProvider.of(parallelSource); } else { @@ -330,7 +336,8 @@ public DynamicTableSource copy() { parseOnlineSchemaChanges, useLegacyJsonFormat, assignUnboundedChunkFirst, - appendOnly); + appendOnly, + snapshotFilter); source.metadataKeys = metadataKeys; source.producedDataType = producedDataType; return source; @@ -376,7 +383,8 @@ public boolean equals(Object o) { && parseOnlineSchemaChanges == that.parseOnlineSchemaChanges && useLegacyJsonFormat == that.useLegacyJsonFormat && assignUnboundedChunkFirst == that.assignUnboundedChunkFirst - && Objects.equals(appendOnly, that.appendOnly); + && Objects.equals(appendOnly, that.appendOnly) + && Objects.equals(snapshotFilter, that.snapshotFilter); } @Override @@ -413,7 +421,8 @@ public int hashCode() { parseOnlineSchemaChanges, useLegacyJsonFormat, assignUnboundedChunkFirst, - appendOnly); + appendOnly, + snapshotFilter); } @Override @@ -438,4 +447,8 @@ Properties getParallelDbzProperties(Properties dbzProperties) { } return newDbzProperties; } + + private String escapeDot(String str) { + return str.replace(".", "\\."); + } } diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSourceFactory.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSourceFactory.java index 5ea430d94e7..72a7a3b12ac 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSourceFactory.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/main/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSourceFactory.java @@ -109,6 +109,7 @@ public DynamicTableSource createDynamicTableSource(Context context) { boolean appendOnly = config.get(MySqlSourceOptions.SCAN_READ_CHANGELOG_AS_APPEND_ONLY_ENABLED); + String snapshotFilter = config.get(MySqlSourceOptions.SCAN_SNAPSHOT_FILTER); if (enableParallelRead) { validatePrimaryKeyIfEnableParallel(physicalSchema, chunkKeyColumn); @@ -124,6 +125,8 @@ public DynamicTableSource createDynamicTableSource(Context context) { MySqlSourceOptions.CONNECT_TIMEOUT, connectTimeout, Duration.ofMillis(250)); } + validateSnapshotFilterWithParallelRead(snapshotFilter, enableParallelRead); + OptionUtils.printOptions(IDENTIFIER, config.toMap()); return new MySqlTableSource( @@ -156,7 +159,8 @@ public DynamicTableSource createDynamicTableSource(Context context) { parseOnLineSchemaChanges, useLegacyJsonFormat, assignUnboundedChunkFirst, - appendOnly); + appendOnly, + snapshotFilter); } @Override @@ -206,6 +210,7 @@ public Set> optionalOptions() { options.add(MySqlSourceOptions.USE_LEGACY_JSON_FORMAT); options.add(MySqlSourceOptions.SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST); options.add(MySqlSourceOptions.SCAN_READ_CHANGELOG_AS_APPEND_ONLY_ENABLED); + options.add(MySqlSourceOptions.SCAN_SNAPSHOT_FILTER); return options; } @@ -385,6 +390,28 @@ private void validateDistributionFactorLower(double distributionFactorLower) { distributionFactorLower)); } + /** + * Checks that snapshot filter is only used when parallel read is enabled. + * + * @param snapshotFilter The snapshot filter expression + * @param enableParallelRead Whether parallel read is enabled + * @throws ValidationException If snapshot filter is set but parallel read is disabled + */ + private void validateSnapshotFilterWithParallelRead( + @Nullable String snapshotFilter, boolean enableParallelRead) { + if (snapshotFilter != null && !snapshotFilter.isEmpty() && !enableParallelRead) { + throw new ValidationException( + String.format( + "Option '%s' can only be used when '%s' is enabled. " + + "Either enable parallel snapshot reading by setting '%s' to true, " + + "or remove the '%s' option.", + MySqlSourceOptions.SCAN_SNAPSHOT_FILTER.key(), + MySqlSourceOptions.SCAN_INCREMENTAL_SNAPSHOT_ENABLED.key(), + MySqlSourceOptions.SCAN_INCREMENTAL_SNAPSHOT_ENABLED.key(), + MySqlSourceOptions.SCAN_SNAPSHOT_FILTER.key())); + } + } + /** Replaces the default timezone placeholder with session timezone, if applicable. */ private static ZoneId getServerTimeZone(ReadableConfig config) { final String serverTimeZone = config.get(MySqlSourceOptions.SERVER_TIME_ZONE); diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/MySqlSourceITCase.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/MySqlSourceITCase.java index 62f51e49512..7911150e0b7 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/MySqlSourceITCase.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/MySqlSourceITCase.java @@ -423,7 +423,7 @@ void testSnapshotSplitReadingFailCrossCheckpoints(String tableName, String chunk RestartStrategyUtils.configureFixedDelayRestartStrategy(env, 1, 0); // The sleeping source will sleep awhile after send per record - MySqlSource sleepingSource = buildSleepingSource(tableName, chunkColumnName); + MySqlSource sleepingSource = buildSleepingSource(tableName, chunkColumnName, null); DataStreamSource source = env.fromSource(sleepingSource, WatermarkStrategy.noWatermarks(), "selfSource"); @@ -496,6 +496,191 @@ void testSnapshotSplitReadingFailCrossCheckpoints(String tableName, String chunk jobClient.cancel().get(); } + @ParameterizedTest + @MethodSource("parameters") + @SuppressWarnings({"rawtypes", "unchecked"}) + void testSnapshotFilters(String tableName, String chunkColumnName) throws Exception { + customDatabase.createAndInitialize(); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(DEFAULT_PARALLELISM); + env.enableCheckpointing(5000L); + RestartStrategyUtils.configureFixedDelayRestartStrategy(env, 1, 0); + + // Filter user with `id > 200` + // The sleeping source will sleep awhile after send per record + MySqlSource sleepingSource = + buildSleepingSource(tableName, chunkColumnName, "id > 200"); + DataStreamSource source = + env.fromSource(sleepingSource, WatermarkStrategy.noWatermarks(), "selfSource"); + + String[] expectedSnapshotData = + new String[] { + "+I[1009, user_10, Shanghai, 123567891234]", + "+I[1010, user_11, Shanghai, 123567891234]", + "+I[1011, user_12, Shanghai, 123567891234]", + "+I[1012, user_13, Shanghai, 123567891234]", + "+I[1013, user_14, Shanghai, 123567891234]", + "+I[1014, user_15, Shanghai, 123567891234]", + "+I[1015, user_16, Shanghai, 123567891234]", + "+I[1016, user_17, Shanghai, 123567891234]", + "+I[1017, user_18, Shanghai, 123567891234]", + "+I[1018, user_19, Shanghai, 123567891234]", + "+I[1019, user_20, Shanghai, 123567891234]", + "+I[2000, user_21, Shanghai, 123567891234]" + }; + TypeSerializer serializer = + source.getTransformation().getOutputType().createSerializer(env.getConfig()); + String accumulatorName = "dataStreamCollect_" + UUID.randomUUID(); + CollectSinkOperatorFactory factory = + new CollectSinkOperatorFactory(serializer, accumulatorName); + CollectSinkOperator operator = (CollectSinkOperator) factory.getOperator(); + CollectResultIterator iterator = + new CollectResultIterator( + operator.getOperatorIdFuture(), + serializer, + accumulatorName, + env.getCheckpointConfig(), + 10000L); + CollectStreamSink sink = new CollectStreamSink(source, factory); + sink.name("Data stream collect sink"); + env.addOperator(sink.getTransformation()); + JobClient jobClient = env.executeAsync("snapshotSplitTest"); + iterator.setJobClient(jobClient); + JobID jobId = jobClient.getJobID(); + + // Trigger failover once some snapshot records has been sent by sleeping source + if (iterator.hasNext()) { + triggerFailover( + FailoverType.JM, + jobId, + miniClusterResource.get().getMiniCluster(), + () -> sleepMs(100)); + } + + // Check all snapshot records are sent with exactly-once semantics + assertEqualsInAnyOrder( + Arrays.asList(expectedSnapshotData), + fetchRowData(iterator, expectedSnapshotData.length)); + Assertions.assertThat(hasNextData(iterator)).isFalse(); + jobClient.cancel().get(); + } + + @Test + @SuppressWarnings({"rawtypes", "unchecked"}) + void testSnapshotFiltersMultipleTables() throws Exception { + customDatabase.createAndInitialize(); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(DEFAULT_PARALLELISM); + env.enableCheckpointing(5000L); + RestartStrategyUtils.configureFixedDelayRestartStrategy(env, 1, 0); + + ResolvedSchema physicalSchema = + new ResolvedSchema( + Arrays.asList( + Column.physical("id", DataTypes.BIGINT().notNull()), + Column.physical("name", DataTypes.STRING()), + Column.physical("address", DataTypes.STRING()), + Column.physical("phone_number", DataTypes.STRING())), + new ArrayList<>(), + UniqueConstraint.primaryKey("pk", Collections.singletonList("id"))); + RowType physicalDataType = + (RowType) physicalSchema.toPhysicalRowDataType().getLogicalType(); + final TypeInformation typeInfo = InternalTypeInfo.of(physicalDataType); + RowDataDebeziumDeserializeSchema deserializer = + RowDataDebeziumDeserializeSchema.newBuilder() + .setPhysicalRowType(physicalDataType) + .setMetadataConverters(new MetadataConverter[0]) + .setResultTypeInfo(typeInfo) + .setServerTimeZone(ZoneId.of("UTC")) + .setUserDefinedConverterFactory( + MySqlDeserializationConverterFactory.instance()) + .build(); + + // Apply different filters to customers (id > 200) and customers_1 (id < 200) + MySqlSource source = + MySqlSource.builder() + .hostname(MYSQL_CONTAINER.getHost()) + .port(MYSQL_CONTAINER.getDatabasePort()) + .databaseList(customDatabase.getDatabaseName()) + .tableList( + customDatabase.getDatabaseName() + ".customers", + customDatabase.getDatabaseName() + ".customers_1") + .username(customDatabase.getUsername()) + .password(customDatabase.getPassword()) + .serverTimeZone("UTC") + .serverId(getServerId()) + .splitSize(8096) + .fetchSize(1024) + .connectTimeout(Duration.ofSeconds(30)) + .debeziumProperties(new Properties()) + .startupOptions(StartupOptions.initial()) + .deserializer(deserializer) + .snapshotFilters( + customDatabase.getDatabaseName() + ".customers", "id > 200") + .snapshotFilters( + customDatabase.getDatabaseName() + ".customers_1", "id < 200") + .build(); + + DataStreamSource stream = + env.fromSource(source, WatermarkStrategy.noWatermarks(), "multiTableSource"); + + // customers with id > 200: 1009-1019, 2000 (12 rows) + // customers_1 with id < 200: 101, 102, 103, 109, 110, 111, 118, 121, 123 (9 rows) + int expectedCount = 12 + 9; + + TypeSerializer serializer = + stream.getTransformation().getOutputType().createSerializer(env.getConfig()); + String accumulatorName = "dataStreamCollect_" + UUID.randomUUID(); + CollectSinkOperatorFactory factory = + new CollectSinkOperatorFactory(serializer, accumulatorName); + CollectSinkOperator operator = (CollectSinkOperator) factory.getOperator(); + CollectResultIterator iterator = + new CollectResultIterator( + operator.getOperatorIdFuture(), + serializer, + accumulatorName, + env.getCheckpointConfig(), + 10000L); + CollectStreamSink sink = new CollectStreamSink(stream, factory); + sink.name("Data stream collect sink"); + env.addOperator(sink.getTransformation()); + JobClient jobClient = env.executeAsync("snapshotFiltersMultiTableTest"); + iterator.setJobClient(jobClient); + + List actual = fetchRowData(iterator, expectedCount); + // customers: id > 200 + assertThat(actual) + .containsAll( + Arrays.asList( + "+I[1009, user_10, Shanghai, 123567891234]", + "+I[1010, user_11, Shanghai, 123567891234]", + "+I[1011, user_12, Shanghai, 123567891234]", + "+I[1012, user_13, Shanghai, 123567891234]", + "+I[1013, user_14, Shanghai, 123567891234]", + "+I[1014, user_15, Shanghai, 123567891234]", + "+I[1015, user_16, Shanghai, 123567891234]", + "+I[1016, user_17, Shanghai, 123567891234]", + "+I[1017, user_18, Shanghai, 123567891234]", + "+I[1018, user_19, Shanghai, 123567891234]", + "+I[1019, user_20, Shanghai, 123567891234]", + "+I[2000, user_21, Shanghai, 123567891234]")); + // customers_1: id < 200 + assertThat(actual) + .containsAll( + Arrays.asList( + "+I[101, user_1, Shanghai, 123567891234]", + "+I[102, user_2, Shanghai, 123567891234]", + "+I[103, user_3, Shanghai, 123567891234]", + "+I[109, user_4, Shanghai, 123567891234]", + "+I[110, user_5, Shanghai, 123567891234]", + "+I[111, user_6, Shanghai, 123567891234]", + "+I[118, user_7, Shanghai, 123567891234]", + "+I[121, user_8, Shanghai, 123567891234]", + "+I[123, user_9, Shanghai, 123567891234]")); + Assertions.assertThat(hasNextData(iterator)).isFalse(); + jobClient.cancel().get(); + } + @ParameterizedTest @MethodSource("parameters") void testStartFromEarliestOffset(String tableName, String chunkColumnName) throws Exception { @@ -1041,7 +1226,8 @@ private CollectResultIterator addCollector( return iterator; } - private MySqlSource buildSleepingSource(String tableName, String chunkColumnName) { + private MySqlSource buildSleepingSource( + String tableName, String chunkColumnName, String snapshotFilter) { ResolvedSchema physicalSchema = new ResolvedSchema( Arrays.asList( @@ -1093,6 +1279,7 @@ private MySqlSource buildSleepingSource(String tableName, String chunkC .chunkKeyColumn( new ObjectPath(customDatabase.getDatabaseName(), tableName), chunkColumnName) + .snapshotFilters(customDatabase.getDatabaseName() + "." + tableName, snapshotFilter) .build(); } diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/assigners/MySqlChunkSplitterTest.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/assigners/MySqlChunkSplitterTest.java index 0e4f369bb81..98776ac9681 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/assigners/MySqlChunkSplitterTest.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/assigners/MySqlChunkSplitterTest.java @@ -135,7 +135,7 @@ public T prepareQueryAndMap( int chunkSize = 5; Object result = - splitter.nextChunkEnd(jdbc, previousChunkEnd, tableId, "id", max, chunkSize); + splitter.nextChunkEnd(jdbc, previousChunkEnd, tableId, "id", max, chunkSize, null); // when queryNextChunkMax returns null, nextChunkEnd should also return null // instead of propagating the null further and causing errors diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/utils/SnapshotFilterUtilsTest.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/utils/SnapshotFilterUtilsTest.java new file mode 100644 index 00000000000..00a2f7aea79 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/source/utils/SnapshotFilterUtilsTest.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.mysql.source.utils; + +import io.debezium.relational.TableId; +import org.assertj.core.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; + +/** Unit test for {@link org.apache.flink.cdc.connectors.mysql.source.utils.SnapshotFilterUtils}. */ +public class SnapshotFilterUtilsTest { + + @Test + public void testGetSnapshotFilter() { + Map map = new HashMap<>(); + map.put("db.user", "id > 100"); + map.put("db.order_[0-9]+", "id > 200"); + Assertions.assertThat(SnapshotFilterUtils.getSnapshotFilter(map, TableId.parse("db.user"))) + .isEqualTo("id > 100"); + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter(map, TableId.parse("db.order_1"))) + .isEqualTo("id > 200"); + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter(map, TableId.parse("db.order_2"))) + .isEqualTo("id > 200"); + Assertions.assertThat(SnapshotFilterUtils.getSnapshotFilter(map, TableId.parse("db.shop"))) + .isNull(); + } + + @Test + public void testGetSnapshotFilterPreservesOrder() { + // Use LinkedHashMap to ensure deterministic order + Map map = new LinkedHashMap<>(); + map.put("db.table_a", "id > 100"); + map.put("db.table_b", "id > 200"); + map.put("db.table_c", "id > 300"); + + // Verify each table matches its corresponding filter + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter(map, TableId.parse("db.table_a"))) + .isEqualTo("id > 100"); + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter(map, TableId.parse("db.table_b"))) + .isEqualTo("id > 200"); + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter(map, TableId.parse("db.table_c"))) + .isEqualTo("id > 300"); + + // Test with regex patterns - non-overlapping patterns + Map regexMap = new LinkedHashMap<>(); + regexMap.put("db.order_[0-9]+", "id > 100"); + regexMap.put("db.user_[0-9]+", "id > 200"); + regexMap.put("db.product_[0-9]+", "id > 300"); + + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter( + regexMap, TableId.parse("db.order_1"))) + .isEqualTo("id > 100"); + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter(regexMap, TableId.parse("db.user_2"))) + .isEqualTo("id > 200"); + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter( + regexMap, TableId.parse("db.product_3"))) + .isEqualTo("id > 300"); + + // Test with overlapping patterns - this is the critical case that reproduces the original + // bug + // Multiple patterns match the same table, should return the first one in insertion order + Map overlappingMap = new LinkedHashMap<>(); + overlappingMap.put("db.order_[0-9]+", "id > 100"); // Broader pattern - matches order_1 + overlappingMap.put("db.order_1", "id > 200"); // More specific - also matches order_1 + overlappingMap.put("db.order_[1-9]", "id > 300"); // Also matches order_1 + + // All three patterns match db.order_1, but should consistently return the first one + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter( + overlappingMap, TableId.parse("db.order_1"))) + .isEqualTo("id > 100"); // First pattern wins + + // Verify this is deterministic by calling multiple times + for (int i = 0; i < 10; i++) { + Assertions.assertThat( + SnapshotFilterUtils.getSnapshotFilter( + overlappingMap, TableId.parse("db.order_1"))) + .isEqualTo("id > 100"); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSourceFactoryTest.java b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSourceFactoryTest.java index 01c2dff84da..cd26e076157 100644 --- a/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSourceFactoryTest.java +++ b/flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc/src/test/java/org/apache/flink/cdc/connectors/mysql/table/MySqlTableSourceFactoryTest.java @@ -129,7 +129,8 @@ void testCommonProperties() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } @@ -179,7 +180,8 @@ void testEnableParallelReadSource() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } @@ -225,7 +227,8 @@ void testEnableParallelReadSourceWithSingleServerId() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } @@ -269,7 +272,8 @@ void testEnableParallelReadSourceLatestOffset() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } @@ -290,6 +294,8 @@ void testOptionalProperties() { options.put("scan.incremental.close-idle-reader.enabled", "true"); options.put("scan.incremental.snapshot.backfill.skip", "true"); options.put("use.legacy.json.format", "true"); + options.put("scan.incremental.snapshot.enabled", "true"); + options.put("scan.snapshot.filter", "id > 200"); DynamicTableSource actualSource = createTableSource(options); Properties dbzProperties = new Properties(); @@ -311,7 +317,7 @@ void testOptionalProperties() { ZoneId.of("Asia/Shanghai"), dbzProperties, "4321", - false, + true, SCAN_INCREMENTAL_SNAPSHOT_CHUNK_SIZE.defaultValue(), CHUNK_META_GROUP_SIZE.defaultValue(), SCAN_SNAPSHOT_FETCH_SIZE.defaultValue(), @@ -330,7 +336,8 @@ void testOptionalProperties() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), true, SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + "id > 200"); Assertions.assertThat(actualSource) .isEqualTo(expectedSource) .isInstanceOf(MySqlTableSource.class); @@ -389,7 +396,8 @@ void testStartupFromSpecificOffset() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } @@ -431,7 +439,8 @@ void testStartupFromInitial() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } @@ -474,7 +483,8 @@ void testStartupFromEarliestOffset() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } @@ -518,7 +528,8 @@ void testStartupFromSpecificTimestamp() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } @@ -560,7 +571,8 @@ void testStartupFromLatestOffset() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } @@ -607,7 +619,8 @@ void testMetadataColumns() { PARSE_ONLINE_SCHEMA_CHANGES.defaultValue(), USE_LEGACY_JSON_FORMAT.defaultValue(), SCAN_INCREMENTAL_SNAPSHOT_UNBOUNDED_CHUNK_FIRST.defaultValue(), - false); + false, + null); expectedSource.producedDataType = SCHEMA_WITH_METADATA.toSourceRowDataType(); expectedSource.metadataKeys = Arrays.asList("op_ts", "database_name"); @@ -768,6 +781,17 @@ void testValidation() { String.format( "The table-name '%s' is not a valid regular expression", "*_invalid_table")); + + // validate snapshot filter requires parallel read enabled + Assertions.assertThatThrownBy( + () -> { + Map properties = getAllOptions(); + properties.put("scan.incremental.snapshot.enabled", "false"); + properties.put("scan.snapshot.filter", "id > 100"); + createTableSource(properties); + }) + .hasStackTraceContaining( + "Option 'scan.snapshot.filter' can only be used when 'scan.incremental.snapshot.enabled' is enabled"); } @Test @@ -810,7 +834,8 @@ void testEnablingExperimentalOptions() { true, true, true, - false); + false, + null); Assertions.assertThat(actualSource).isEqualTo(expectedSource); } diff --git a/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/src/test/java/org/apache/flink/cdc/pipeline/tests/MysqlE2eITCase.java b/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/src/test/java/org/apache/flink/cdc/pipeline/tests/MysqlE2eITCase.java index 7b551db8333..651f72e0789 100644 --- a/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/src/test/java/org/apache/flink/cdc/pipeline/tests/MysqlE2eITCase.java +++ b/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/src/test/java/org/apache/flink/cdc/pipeline/tests/MysqlE2eITCase.java @@ -535,4 +535,54 @@ void testDanglingDropTableEventInBinlog() throws Exception { "CreateTableEvent{tableId=%s.products, schema=columns={`id` INT NOT NULL,`name` VARCHAR(255) NOT NULL 'flink',`description` VARCHAR(512),`weight` FLOAT,`enum_c` STRING 'red',`json_c` STRING,`point_c` STRING}, primaryKeys=id, options=()}", "DataChangeEvent{tableId=%s.products, before=[106, hammer, 16oz carpenter's hammer, 1.0, null, null, null], after=[106, hammer, 18oz carpenter hammer, 1.0, null, null, null], op=UPDATE, meta=()}"); } + + @Test + void testSnapshotFilters() throws Exception { + String pipelineJob = + String.format( + "source:\n" + + " type: mysql\n" + + " hostname: %s\n" + + " port: 3306\n" + + " username: %s\n" + + " password: %s\n" + + " tables: %s.\\.*\n" + + " server-id: 5400-5404\n" + + " server-time-zone: UTC\n" + + " scan.snapshot.filters:\n" + + " - table: %s.customers\n" + + " filter: id > 102\n" + + " - table: %s.products\n" + + " filter: id < 105\n" + + "\n" + + "sink:\n" + + " type: values\n" + + "\n" + + "pipeline:\n" + + " parallelism: %d", + INTER_CONTAINER_MYSQL_ALIAS, + MYSQL_TEST_USER, + MYSQL_TEST_PASSWORD, + mysqlInventoryDatabase.getDatabaseName(), + mysqlInventoryDatabase.getDatabaseName(), + mysqlInventoryDatabase.getDatabaseName(), + parallelism); + + submitPipelineJob(pipelineJob); + waitUntilJobRunning(Duration.ofSeconds(30)); + LOG.info("Pipeline job is running"); + + // customers: only id > 102 (id=103, 104) + // products: only id < 105 (id=101, 102, 103, 104) + validateResult( + dbNameFormatter, + "CreateTableEvent{tableId=%s.customers, schema=columns={`id` INT NOT NULL,`name` VARCHAR(255) NOT NULL 'flink',`address` VARCHAR(1024),`phone_number` VARCHAR(512)}, primaryKeys=id, options=()}", + "DataChangeEvent{tableId=%s.customers, before=[], after=[103, user_3, Shanghai, 123567891234], op=INSERT, meta=()}", + "DataChangeEvent{tableId=%s.customers, before=[], after=[104, user_4, Shanghai, 123567891234], op=INSERT, meta=()}", + "CreateTableEvent{tableId=%s.products, schema=columns={`id` INT NOT NULL,`name` VARCHAR(255) NOT NULL 'flink',`description` VARCHAR(512),`weight` FLOAT,`enum_c` STRING 'red',`json_c` STRING,`point_c` STRING}, primaryKeys=id, options=()}", + "DataChangeEvent{tableId=%s.products, before=[], after=[101, scooter, Small 2-wheel scooter, 3.14, red, {\"key1\": \"value1\"}, {\"coordinates\":[1,1],\"type\":\"Point\",\"srid\":0}], op=INSERT, meta=()}", + "DataChangeEvent{tableId=%s.products, before=[], after=[102, car battery, 12V car battery, 8.1, white, {\"key2\": \"value2\"}, {\"coordinates\":[2,2],\"type\":\"Point\",\"srid\":0}], op=INSERT, meta=()}", + "DataChangeEvent{tableId=%s.products, before=[], after=[103, 12-pack drill bits, 12-pack of drill bits with sizes ranging from #40 to #3, 0.8, red, {\"key3\": \"value3\"}, {\"coordinates\":[3,3],\"type\":\"Point\",\"srid\":0}], op=INSERT, meta=()}", + "DataChangeEvent{tableId=%s.products, before=[], after=[104, hammer, 12oz carpenter's hammer, 0.75, white, {\"key4\": \"value4\"}, {\"coordinates\":[4,4],\"type\":\"Point\",\"srid\":0}], op=INSERT, meta=()}"); + } }