-
Notifications
You must be signed in to change notification settings - Fork 298
codec(ticdc): support header line for CSV protocol (#12183) #12433
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: release-8.5
Are you sure you want to change the base?
Changes from 7 commits
3f6fa0c
36ff62b
7f97ef3
7123e3e
f86777e
185f024
e1f0da7
9e625f4
9ed650b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,13 +18,15 @@ import ( | |
| "io" | ||
|
|
||
| "github.com/pingcap/errors" | ||
| "github.com/pingcap/log" | ||
| lconfig "github.com/pingcap/tidb/pkg/lightning/config" | ||
| "github.com/pingcap/tidb/pkg/lightning/mydump" | ||
| "github.com/pingcap/tidb/pkg/lightning/worker" | ||
| "github.com/pingcap/tiflow/cdc/model" | ||
| cerror "github.com/pingcap/tiflow/pkg/errors" | ||
| "github.com/pingcap/tiflow/pkg/sink/codec" | ||
| "github.com/pingcap/tiflow/pkg/sink/codec/common" | ||
| "go.uber.org/zap" | ||
| ) | ||
|
|
||
| const defaultIOConcurrency = 1 | ||
|
|
@@ -52,11 +54,13 @@ func NewBatchDecoder(ctx context.Context, | |
| backslashEscape = true | ||
| } | ||
| cfg := &lconfig.CSVConfig{ | ||
| Separator: codecConfig.Delimiter, | ||
| Delimiter: codecConfig.Quote, | ||
| Terminator: codecConfig.Terminator, | ||
| Null: []string{codecConfig.NullString}, | ||
| BackslashEscape: backslashEscape, | ||
| Separator: codecConfig.Delimiter, | ||
| Delimiter: codecConfig.Quote, | ||
| Terminator: codecConfig.Terminator, | ||
| Null: []string{codecConfig.NullString}, | ||
| BackslashEscape: backslashEscape, | ||
| HeaderSchemaMatch: true, | ||
| Header: codecConfig.CSVOutputFieldHeader, | ||
| } | ||
| csvParser, err := mydump.NewCSVParser(ctx, cfg, | ||
| mydump.NewStringReader(string(value)), | ||
|
|
@@ -65,6 +69,21 @@ func NewBatchDecoder(ctx context.Context, | |
| if err != nil { | ||
| return nil, err | ||
| } | ||
| if codecConfig.CSVOutputFieldHeader { | ||
| err := csvParser.ReadColumns() | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| header := csvParser.Columns() | ||
| log.Info("parser CSV header", zap.Any("header", header), zap.Any("cap", cap(header))) | ||
| // check column name | ||
| idx := len(header) - len(tableInfo.Columns) | ||
| for i, col := range tableInfo.Columns { | ||
| if col.Name.O != header[idx+i] { | ||
| log.Panic("check column name order failed", zap.Any("col", col.Name.O), zap.Any("header", header[idx+i])) | ||
| } | ||
|
||
| } | ||
| } | ||
| return &batchDecoder{ | ||
| codecConfig: codecConfig, | ||
| tableInfo: tableInfo, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -100,7 +100,8 @@ func newCSVMessage(config *common.Config) *csvMessage { | |
| // Col2: Table name, the name of the source table. | ||
| // Col3: Schema name, the name of the source schema. | ||
| // Col4: Commit TS, the commit-ts of the source txn (optional). | ||
| // Col5-n: one or more columns that represent the data to be changed. | ||
| // Col5: Is-update, this column only exists when the value of output-old-value is true (optional). | ||
| // Col6-n: one or more columns that represent the data to be changed. | ||
| func (c *csvMessage) encode() []byte { | ||
| strBuilder := new(strings.Builder) | ||
| if c.opType == operationUpdate && c.config.OutputOldValue && len(c.preColumns) != 0 { | ||
|
|
@@ -116,7 +117,7 @@ func (c *csvMessage) encode() []byte { | |
| c.encodeMeta(c.opType.String(), strBuilder) | ||
| c.encodeColumns(c.columns, strBuilder) | ||
| } | ||
| return []byte(strBuilder.String()) | ||
| return common.UnsafeStringToBytes(strBuilder.String()) | ||
| } | ||
|
|
||
| func (c *csvMessage) encodeMeta(opType string, b *strings.Builder) { | ||
|
|
@@ -486,3 +487,32 @@ func csvColumns2RowChangeColumns(csvConfig *common.Config, csvCols []any, ticols | |
|
|
||
| return cols, nil | ||
| } | ||
|
|
||
| // The header should contain the name corresponding to the file record field, | ||
| // and should have the same number as the record field. | ||
| // | ticdc-meta$operation | ticdc-meta$table | ticdc-meta$schema | ticdc-meta$commit-ts | ticdc-meta$is-update | col1 | col2 | ... | | ||
| func encodeHeader(config *common.Config, colNames []string) []byte { | ||
| if !config.CSVOutputFieldHeader { | ||
| return nil | ||
| } | ||
| strBuilder := new(strings.Builder) | ||
| strBuilder.WriteString("ticdc-meta$operation") | ||
| strBuilder.WriteString(config.Delimiter) | ||
| strBuilder.WriteString("ticdc-meta$table") | ||
| strBuilder.WriteString(config.Delimiter) | ||
| strBuilder.WriteString("ticdc-meta$schema") | ||
| if config.IncludeCommitTs { | ||
| strBuilder.WriteString(config.Delimiter) | ||
| strBuilder.WriteString("ticdc-meta$commit-ts") | ||
| } | ||
| if config.OutputOldValue { | ||
| strBuilder.WriteString(config.Delimiter) | ||
| strBuilder.WriteString("ticdc-meta$is-update") | ||
| } | ||
| for _, name := range colNames { | ||
|
Comment on lines
+511
to
+512
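There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The header does not include a handle-key column; the suggested change below writes "ticdc-meta$handle-key" when config.OutputHandleKey is set, before the data column names:
}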
if config.OutputHandleKey {
strBuilder.WriteString(config.Delimiter)
strBuilder.WriteString("ticdc-meta$handle-key")
}
for _, name := range colNames { |
||
| strBuilder.WriteString(config.Delimiter) | ||
| strBuilder.WriteString(name) | ||
| } | ||
| strBuilder.WriteString(config.Terminator) | ||
| return common.UnsafeStringToBytes(strBuilder.String()) | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -26,6 +26,13 @@ function run() { | |||||||||||||||
| run_storage_consumer $WORK_DIR $SINK_URI $CUR/conf/changefeed.toml "" | ||||||||||||||||
| sleep 8 | ||||||||||||||||
| check_sync_diff $WORK_DIR $CUR/conf/diff_config.toml 100 | ||||||||||||||||
| # check csv header | ||||||||||||||||
| find "$WORK_DIR/storage_test/" -type f -name "*.csv" | while read -r file; do | ||||||||||||||||
| first_line=$(head -n 1 $file) | ||||||||||||||||
| if [[ "$first_line" != ticdc-meta* ]]; then | ||||||||||||||||
| echo "check CSV header failed. header: $first_line" | ||||||||||||||||
| fi | ||||||||||||||||
|
Comment on lines
+32
to
+34
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The check for the CSV header is not strict enough. It only echoes a message on failure and does not cause the script to exit with an error. This could lead to test failures being missed. The check should be made stricter to exit on failure. Also, it's good practice to quote file paths (e.g., "$file") so that paths containing spaces or glob characters are handled correctly.
Suggested change
|
||||||||||||||||
| done | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| trap stop_tidb_cluster EXIT | ||||||||||||||||
|
|
||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment "// There is always only one message here in task.msgs" is an important assumption. While it might be true for the current implementation, it could become a source of bugs if the logic for batching messages into tasks changes in the future. If a task could contain multiple messages, this logic would only write the header for the very first message in the task, potentially missing headers for subsequent messages if they were intended to start new files. Consider adding a more detailed explanation or a link to where this assumption is guaranteed.