[FLINK-35740][mysql] Allow column as chunk key even if not in Primary Keys #3448

Merged
Changes from 10 commits
8 changes: 5 additions & 3 deletions docs/content/docs/connectors/flink-sources/mysql-cdc.md
@@ -523,7 +523,7 @@ Incremental snapshot reading is a new mechanism to read snapshot of a table. Com
If you would like the source to run in parallel, each parallel reader should have a unique server id, so the 'server-id' must be a range like '5400-6400',
and the range must be larger than the parallelism.

During the incremental snapshot reading, the MySQL CDC Source firstly splits snapshot chunks (splits) by primary key of table,
During the incremental snapshot reading, the MySQL CDC Source first splits the snapshot into chunks (splits) by the user-specified chunk key of the table,
and then the MySQL CDC Source assigns the chunks to multiple readers to read the data of each snapshot chunk.
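For illustration only (not part of this diff), a minimal Flink SQL sketch of a parallel incremental-snapshot source; the hostname, credentials, schema, and database/table names are placeholder assumptions, and the `server-id` range covers an assumed parallelism of 4:

```sql
-- Hypothetical source table: with parallelism 4, 'server-id' must span more than 4 ids.
-- Snapshot chunks are split by the chunk key (by default, the first primary key column).
CREATE TABLE orders_source (
    order_id BIGINT,
    customer_id BIGINT,
    order_time TIMESTAMP(3),
    PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
    'connector' = 'mysql-cdc',
    'hostname' = 'localhost',
    'port' = '3306',
    'username' = 'flinkuser',
    'password' = 'flinkpw',
    'database-name' = 'mydb',
    'table-name' = 'orders',
    'server-id' = '5400-5404'
);
```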

#### Controlling Parallelism
@@ -573,7 +573,7 @@ The CDC job may restart fails in this case. So the heartbeat event will help upd

When the MySQL CDC source is started, it reads the snapshot of the table in parallel and then reads the binlog of the table with a single parallelism.

In snapshot phase, the snapshot is cut into multiple snapshot chunks according to primary key of table and the size of table rows.
In the snapshot phase, the snapshot is cut into multiple snapshot chunks according to the chunk key of the table and the size of the table rows.
Snapshot chunks are assigned to multiple snapshot readers. Each snapshot reader reads its received chunks with the [chunk reading algorithm](#snapshot-chunk-reading) and sends the read data downstream.
The source manages the processing status (finished or not) of chunks, so the snapshot phase of the source can support checkpoints at the chunk level.
If a failure happens, the source can be restored and continue reading chunks from the last finished chunk.
@@ -589,7 +589,9 @@ Flink performs checkpoints for the source periodically, in case of failover, the

When performing incremental snapshot reading, the MySQL CDC source needs a criterion used to split the table.
The MySQL CDC source uses a splitting column to split the table into multiple splits (chunks). By default, the MySQL CDC source will identify the primary key of the table and use its first column as the splitting column.
If there is no primary key in the table, incremental snapshot reading will fail and you can disable `scan.incremental.snapshot.enabled` to fallback to old snapshot reading mechanism.
If there is no primary key in the table, the user must specify `scan.incremental.snapshot.chunk.key-column`,
otherwise incremental snapshot reading will fail, and you can disable `scan.incremental.snapshot.enabled` to fall back to the old snapshot reading mechanism.
Please note that using a column not in the primary key as the chunk key can result in slower table query performance.
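As a hedged sketch of the behaviour this change enables, the DDL below sets a chunk key for a table without a primary key, mirroring the `customers_no_pk` table and `id` chunk key used in the new test; the column types and connection settings are placeholder assumptions:

```sql
-- Hypothetical table without a primary key: incremental snapshot reading requires an
-- explicit chunk key column, and with this change it may be any column of the table.
CREATE TABLE customers_no_pk_source (
    id BIGINT,
    name STRING,
    address STRING
) WITH (
    'connector' = 'mysql-cdc',
    'hostname' = 'localhost',
    'port' = '3306',
    'username' = 'flinkuser',
    'password' = 'flinkpw',
    'database-name' = 'customer',
    'table-name' = 'customers_no_pk',
    'scan.incremental.snapshot.chunk.key-column' = 'id'
);
```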

For a numeric and auto-incremental splitting column, the MySQL CDC Source efficiently splits chunks by a fixed step length.
For example, if you had a table with a primary key column `id` of auto-incremental BIGINT type, the minimum value was `0` and maximum value was `100`,
@@ -109,6 +109,5 @@ public class JdbcSourceOptions extends SourceOptions {
.noDefaultValue()
.withDescription(
"The chunk key of table snapshot, captured tables are split into multiple chunks by a chunk key when read the snapshot of table."
+ "By default, the chunk key is the first column of the primary key and the chunk key is the RowId in oracle."
+ "This column must be a column of the primary key.");
+ "By default, the chunk key is the first column of the primary key and the chunk key is the RowId in oracle.");
}
@@ -243,8 +243,7 @@ public class MySqlSourceOptions {
.noDefaultValue()
.withDescription(
"The chunk key of table snapshot, captured tables are split into multiple chunks by a chunk key when read the snapshot of table."
+ "By default, the chunk key is the first column of the primary key."
+ "This column must be a column of the primary key.");
+ "By default, the chunk key is the first column of the primary key.");

@Experimental
public static final ConfigOption<Boolean> SCAN_INCREMENTAL_CLOSE_IDLE_READER_ENABLED =
@@ -57,9 +57,8 @@ public static RowType getChunkKeyColumnType(Column chunkKeyColumn) {

/**
* Get the chunk key column. This column could be set by `chunkKeyColumn`. If the table doesn't
* have primary keys, `chunkKeyColumn` must be set. If the table has primary keys,
* `chunkKeyColumn` must be a column of them or else null. When the parameter `chunkKeyColumn`
* is not set and the table has primary keys, return the first column of primary keys.
* have primary keys, `chunkKeyColumn` must be set. When the parameter `chunkKeyColumn` is not
* set and the table has primary keys, return the first column of primary keys.
*/
public static Column getChunkKeyColumn(Table table, Map<ObjectPath, String> chunkKeyColumns) {
List<Column> primaryKeys = table.primaryKeyColumns();
@@ -68,7 +67,8 @@ public static Column getChunkKeyColumn(Table table, Map<ObjectPath, String> chun
throw new ValidationException(
"'scan.incremental.snapshot.chunk.key-column' must be set when the table doesn't have primary keys.");
}
List<Column> searchColumns = primaryKeys.isEmpty() ? table.columns() : primaryKeys;

List<Column> searchColumns = table.columns();
if (chunkKeyColumn != null) {
Optional<Column> targetColumn =
searchColumns.stream()
@@ -79,9 +79,8 @@ }
}
throw new ValidationException(
String.format(
"Chunk key column '%s' doesn't exist in the %s [%s] of the table %s.",
"Chunk key column '%s' doesn't exist in the columns [%s] of the table %s.",
chunkKeyColumn,
primaryKeys.isEmpty() ? "user specified columns" : "primary keys",
searchColumns.stream()
.map(Column::name)
.collect(Collectors.joining(",")),
@@ -82,12 +82,7 @@ public static Object[] rowToArray(ResultSet rs, int size) throws SQLException {
}

public static Struct getStructContainsChunkKey(SourceRecord record) {
// If the table has primary keys, chunk key is in the record key struct
if (record.key() != null) {
return (Struct) record.key();
}

// If the table doesn't have primary keys, chunk key is in the after struct for insert or
// Use chunk key in the after struct for insert or
// the before struct for delete/update
Envelope.Operation op = Envelope.operationFor(record);
Struct value = (Struct) record.value();
@@ -109,9 +104,9 @@ public static void upsertBinlog(
if (isDataChangeRecord(binlogRecord)) {
Struct value = (Struct) binlogRecord.value();
if (value != null) {
Struct keyStruct = getStructContainsChunkKey(binlogRecord);
Struct chunkKeyStruct = getStructContainsChunkKey(binlogRecord);
if (splitKeyRangeContains(
getSplitKey(splitBoundaryType, nameAdjuster, keyStruct),
getSplitKey(splitBoundaryType, nameAdjuster, chunkKeyStruct),
splitStart,
splitEnd)) {
boolean hasPrimaryKey = binlogRecord.key() != null;
Expand All @@ -124,7 +119,7 @@ public static void upsertBinlog(
snapshotRecords,
binlogRecord,
hasPrimaryKey
? keyStruct
? (Struct) binlogRecord.key()
: createReadOpValue(
binlogRecord, Envelope.FieldName.AFTER),
false);
@@ -152,15 +147,15 @@ public static void upsertBinlog(
upsertBinlog(
snapshotRecords,
binlogRecord,
hasPrimaryKey ? keyStruct : structFromAfter,
hasPrimaryKey ? (Struct) binlogRecord.key() : structFromAfter,
false);
break;
case DELETE:
upsertBinlog(
snapshotRecords,
binlogRecord,
hasPrimaryKey
? keyStruct
? (Struct) binlogRecord.key()
: createReadOpValue(
binlogRecord, Envelope.FieldName.BEFORE),
true);
@@ -166,7 +166,7 @@ public void testAssignCompositePkTableWithWrongChunkKeyColumn() {
assertTrue(
ExceptionUtils.findThrowableWithMessage(
t,
"Chunk key column 'errorCol' doesn't exist in the primary keys [card_no,level] of the table")
"Chunk key column 'errorCol' doesn't exist in the columns [card_no,level,name,note] of the table")
.isPresent());
}
}
@@ -416,6 +416,28 @@ public void testTableWithoutPrimaryKey() {
}
}

@Test
public void testAssignTableWithoutPrimaryKeyWithChunkKeyColumn() {
String tableWithoutPrimaryKey = "customers_no_pk";
List<String> expected =
Arrays.asList(
"customers_no_pk null [462]",
"customers_no_pk [462] [823]",
"customers_no_pk [823] [1184]",
"customers_no_pk [1184] [1545]",
"customers_no_pk [1545] [1906]",
"customers_no_pk [1906] null");
List<String> splits =
getTestAssignSnapshotSplits(
customerDatabase,
4,
CHUNK_KEY_EVEN_DISTRIBUTION_FACTOR_UPPER_BOUND.defaultValue(),
CHUNK_KEY_EVEN_DISTRIBUTION_FACTOR_LOWER_BOUND.defaultValue(),
new String[] {tableWithoutPrimaryKey},
"id");
assertEquals(expected, splits);
}
Comment on lines +419 to +439
Contributor
It would be nice if we could also test using non-primary-key columns as chunk keys for tables with primary keys.


@Test
public void testEnumerateTablesLazily() {
final MySqlSourceConfig configuration =