-
Notifications
You must be signed in to change notification settings - Fork 49
maintainer: avoid panic when maintainer bootstrap #4518
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 25 commits
e18394d
aad433e
5c81cb5
53086f8
0086672
3a2ab29
f7ea48b
979c5de
473b746
6c07d9d
c0c382f
b6294e5
e5b53cc
c8549fd
a85b80a
eb2a4c9
6e89a30
6664e42
6b9a4e5
ecbe335
8008728
113f22f
e4f6c78
b9e3379
975bb8e
20c27fe
f777938
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -88,7 +88,12 @@ func (c *Controller) FinishBootstrap( | |||||||||||||||||||||||||||
| zap.Int("nodeCount", len(allNodesResp))) | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| // Step 1: Determine start timestamp and update DDL dispatcher | ||||||||||||||||||||||||||||
| startTs, redoStartTs := c.determineStartTs(allNodesResp) | ||||||||||||||||||||||||||||
| startTs, redoStartTs, err := c.determineStartTs(allNodesResp) | ||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||
| log.Error("can not determine the startTs from the bootstrap response", | ||||||||||||||||||||||||||||
| zap.String("changefeed", c.changefeedID.Name()), zap.Error(err)) | ||||||||||||||||||||||||||||
| return nil, errors.Trace(err) | ||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
|
Comment on lines
+91
to
+96
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove panic on bootstrap start-ts resolution failure Line 93 still panics, so the maintainer can crash in the exact failure path this PR is trying to harden. Return the error instead of panicking, and propagate it without re-wrapping at Line 95. ✅ Suggested fix startTs, redoStartTs, err := c.determineStartTs(allNodesResp)
if err != nil {
- log.Panic("cant not found the startTs from the bootstrap response",
- zap.String("changefeed", c.changefeedID.Name()))
- return nil, errors.Trace(err)
+ log.Error("cannot determine start ts from bootstrap response",
+ zap.Stringer("changefeed", c.changefeedID),
+ zap.Error(err))
+ return nil, err
}📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| // Step 2: Load tables from schema store | ||||||||||||||||||||||||||||
| tables, err := c.loadTables(startTs) | ||||||||||||||||||||||||||||
|
|
@@ -144,7 +149,7 @@ func (c *Controller) FinishBootstrap( | |||||||||||||||||||||||||||
| }, nil | ||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| func (c *Controller) determineStartTs(allNodesResp map[node.ID]*heartbeatpb.MaintainerBootstrapResponse) (uint64, uint64) { | ||||||||||||||||||||||||||||
| func (c *Controller) determineStartTs(allNodesResp map[node.ID]*heartbeatpb.MaintainerBootstrapResponse) (uint64, uint64, error) { | ||||||||||||||||||||||||||||
|
wk989898 marked this conversation as resolved.
|
||||||||||||||||||||||||||||
| var ( | ||||||||||||||||||||||||||||
| startTs uint64 | ||||||||||||||||||||||||||||
| redoStartTs uint64 | ||||||||||||||||||||||||||||
|
|
@@ -170,14 +175,18 @@ func (c *Controller) determineStartTs(allNodesResp map[node.ID]*heartbeatpb.Main | |||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
| if startTs == 0 { | ||||||||||||||||||||||||||||
| log.Panic("cant not found the startTs from the bootstrap response", | ||||||||||||||||||||||||||||
| zap.String("changefeed", c.changefeedID.Name())) | ||||||||||||||||||||||||||||
| return 0, 0, errors.WrapError( | ||||||||||||||||||||||||||||
| errors.ErrChangefeedInitTableTriggerDispatcherFailed, | ||||||||||||||||||||||||||||
| errors.New("all bootstrap responses reported empty checkpointTs"), | ||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||
|
Comment on lines
177
to
+181
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
| if c.enableRedo && redoStartTs == 0 { | ||||||||||||||||||||||||||||
| log.Panic("cant not found the redoStartTs from the bootstrap response", | ||||||||||||||||||||||||||||
| zap.String("changefeed", c.changefeedID.Name())) | ||||||||||||||||||||||||||||
| return 0, 0, errors.WrapError( | ||||||||||||||||||||||||||||
| errors.ErrChangefeedInitTableTriggerDispatcherFailed, | ||||||||||||||||||||||||||||
| errors.New("all bootstrap responses reported empty redoCheckpointTs"), | ||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||
|
wk989898 marked this conversation as resolved.
|
||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
| return startTs, redoStartTs | ||||||||||||||||||||||||||||
| return startTs, redoStartTs, nil | ||||||||||||||||||||||||||||
|
wk989898 marked this conversation as resolved.
|
||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| func (c *Controller) buildWorkingTaskMap( | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,7 @@ | |
| appcontext "github.com/pingcap/ticdc/pkg/common/context" | ||
| commonEvent "github.com/pingcap/ticdc/pkg/common/event" | ||
| "github.com/pingcap/ticdc/pkg/config" | ||
| cerrors "github.com/pingcap/ticdc/pkg/errors" | ||
| "github.com/pingcap/ticdc/pkg/eventservice" | ||
| "github.com/pingcap/ticdc/pkg/node" | ||
| "github.com/pingcap/ticdc/pkg/pdutil" | ||
|
|
@@ -1434,6 +1435,38 @@ | |
| require.Nil(t, postBootstrapRequest) | ||
| } | ||
|
|
||
| func TestFinishBootstrapReturnsErrorWhenCheckpointMissing(t *testing.T) { | ||
| testutil.SetUpTestServices() | ||
| nodeManager := appcontext.GetService[*watcher.NodeManager](watcher.NodeManagerName) | ||
| nodeManager.GetAliveNodes()["node1"] = &node.Info{ID: "node1"} | ||
|
|
||
| tableTriggerEventDispatcherID := common.NewDispatcherID() | ||
| cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) | ||
| ddlSpan := replica.NewWorkingSpanReplication(cfID, tableTriggerEventDispatcherID, | ||
| common.DDLSpanSchemaID, | ||
| common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ | ||
| ID: tableTriggerEventDispatcherID.ToPB(), | ||
| ComponentStatus: heartbeatpb.ComponentState_Working, | ||
| CheckpointTs: 1, | ||
| }, "node1", false) | ||
| refresher := replica.NewRegionCountRefresher(cfID, time.Minute) | ||
| controller := NewController(cfID, 1, &mockThreadPool{}, | ||
| config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false) | ||
|
|
||
| postBootstrapRequest, err := controller.FinishBootstrap(map[node.ID]*heartbeatpb.MaintainerBootstrapResponse{ | ||
| "node1": { | ||
| ChangefeedID: cfID.ToPB(), | ||
| }, | ||
| }, false) | ||
| require.Nil(t, postBootstrapRequest) | ||
| require.Error(t, err) | ||
| code, ok := cerrors.RFCCode(err) | ||
| require.True(t, ok) | ||
| require.Equal(t, cerrors.ErrChangefeedInitTableTriggerDispatcherFailed.RFCCode(), code) | ||
| require.Contains(t, err.Error(), "all bootstrap responses reported empty checkpointTs") | ||
| require.False(t, controller.bootstrapped) | ||
|
Comment on lines
+1438
to
+1467
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This new test case |
||
| } | ||
|
|
||
| // TestFinishBootstrapSkipsStaleCreateOperatorForDroppedTable covers stale bootstrap Create requests | ||
| // for dropped tables across add/move/split operator types. Each subtest boots from an empty schema | ||
| // snapshot and verifies bootstrap skips the stale create phase instead of recreating ghost tasks or | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| # diff Configuration. | ||
|
|
||
| check-thread-count = 4 | ||
|
|
||
| export-fix-sql = true | ||
|
|
||
| check-struct-only = false | ||
|
|
||
| [task] | ||
| output-dir = "/tmp/tidb_cdc_test/bootstrap_retry_after_error/output" | ||
|
|
||
| source-instances = ["mysql1"] | ||
|
|
||
| target-instance = "tidb0" | ||
|
|
||
| target-check-tables = ["test.?*"] | ||
|
|
||
| [data-sources] | ||
| [data-sources.mysql1] | ||
| host = "127.0.0.1" | ||
| port = 4000 | ||
| user = "root" | ||
| password = "" | ||
|
|
||
| [data-sources.tidb0] | ||
| host = "127.0.0.1" | ||
| port = 3306 | ||
| user = "root" | ||
| password = "" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,111 @@ | ||
| #!/bin/bash | ||
|
|
||
| set -eu | ||
|
|
||
| # This integration test covers bootstrap retry handling after an initial | ||
| # bootstrap failure. | ||
| # | ||
| # Steps: | ||
| # 1. Start one TiCDC node with a schema store failpoint that keeps bootstrap | ||
| # failing with ErrSnapshotLostByGC on the maintainer node. | ||
| # 2. Create a mysql sink changefeed and wait until bootstrap fails with | ||
| # ErrSnapshotLostByGC. | ||
| # 3. Start a second TiCDC node immediately after the first bootstrap error so | ||
| # the failed maintainer still observes node scheduling and processes another | ||
| # bootstrap response. | ||
| # 4. Verify logs contain the retry path: | ||
| # maintainer node changed -> bootstrap response -> handle bootstrap response. | ||
| # 5. Verify both TiCDC servers keep running and the changefeed remains failed | ||
| # with ErrSnapshotLostByGC. | ||
|
|
||
| CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) | ||
| REPO_ROOT=$(cd "$CUR/../../.." && pwd) | ||
| source $CUR/../_utils/test_prepare | ||
| WORK_DIR=$OUT_DIR/$TEST_NAME | ||
| CDC_BINARY=cdc.test | ||
| SINK_TYPE=$1 | ||
| MAX_RETRIES=20 | ||
| FAILPOINT_BLOCK_BEFORE_STOP_CHANGEFEED="github.com/pingcap/ticdc/coordinator/BlockBeforeStopChangefeed" | ||
|
|
||
| PD_ADDR="http://${UP_PD_HOST_1}:${UP_PD_PORT_1}" | ||
| SINK_URI="mysql://normal:123456@127.0.0.1:3306/" | ||
|
|
||
| function check_node_change_triggers_bootstrap() { | ||
| local work_dir=$1 | ||
| local file | ||
|
|
||
| for file in "$work_dir"/cdc*.log; do | ||
| if [ ! -f "$file" ]; then | ||
| continue | ||
| fi | ||
| if awk ' | ||
| /maintainer node changed/ { | ||
| nodeChanged = 1 | ||
| gotResp = 0 | ||
| handled = 0 | ||
| } | ||
| nodeChanged && /maintainer received bootstrap response/ { | ||
| gotResp = 1 | ||
| } | ||
| nodeChanged && gotResp && /handle bootstrap response/ { | ||
| handled = 1 | ||
| exit 0 | ||
| } | ||
| END { | ||
| exit handled ? 0 : 1 | ||
| } | ||
| ' "$file"; then | ||
| return 0 | ||
| fi | ||
| done | ||
|
|
||
| return 1 | ||
| } | ||
|
|
||
| export -f check_node_change_triggers_bootstrap | ||
|
|
||
| function run() { | ||
| if [ "$SINK_TYPE" != "mysql" ]; then | ||
| return | ||
| fi | ||
|
|
||
| rm -rf $WORK_DIR && mkdir -p $WORK_DIR | ||
|
|
||
| start_tidb_cluster --workdir $WORK_DIR | ||
|
|
||
| export GO_FAILPOINTS='github.com/pingcap/ticdc/logservice/schemastore/getAllPhysicalTablesGCFastFail=return(true)' | ||
| run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "0" | ||
| # wait for 20 seconds to let the server start and hit the failpoint | ||
| sleep 20 | ||
| enable_failpoint --addr "127.0.0.1:8300" --name "$FAILPOINT_BLOCK_BEFORE_STOP_CHANGEFEED" --expr "pause" | ||
|
|
||
| run_sql "CREATE DATABASE bootstrap_retry_after_error;" ${UP_TIDB_HOST} ${UP_TIDB_PORT} | ||
| run_sql "CREATE TABLE bootstrap_retry_after_error.t1(id INT PRIMARY KEY, val INT);" ${UP_TIDB_HOST} ${UP_TIDB_PORT} | ||
| run_sql "CREATE DATABASE bootstrap_retry_after_error;" ${DOWN_TIDB_HOST} ${DOWN_TIDB_PORT} | ||
| run_sql "CREATE TABLE bootstrap_retry_after_error.t1(id INT PRIMARY KEY, val INT);" ${DOWN_TIDB_HOST} ${DOWN_TIDB_PORT} | ||
|
|
||
| cdc_cli_changefeed create --sink-uri="$SINK_URI" -c "test" | ||
|
|
||
| ensure $MAX_RETRIES "grep -Eq 'ErrSnapshotLostByGC' $WORK_DIR/cdc*.log" | ||
|
|
||
| # Start the second node without the schema-store failpoint. The retry still | ||
| # fails because the maintainer is running on the first node, which keeps the | ||
| # bootstrap error active while we verify node scheduling triggers another | ||
| # bootstrap round. | ||
| export GO_FAILPOINTS='' | ||
|
|
||
| run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "1" --addr "127.0.0.1:8301" --pd "$PD_ADDR" | ||
|
|
||
| ensure $MAX_RETRIES "check_node_change_triggers_bootstrap $WORK_DIR" | ||
| disable_failpoint --addr "127.0.0.1:8300" --name "$FAILPOINT_BLOCK_BEFORE_STOP_CHANGEFEED" | ||
| ensure $MAX_RETRIES "get_cdc_pid 127.0.0.1 8300 >/dev/null" | ||
| ensure $MAX_RETRIES "get_cdc_pid 127.0.0.1 8301 >/dev/null" | ||
| # sleep for a while to let the logs flush | ||
| sleep 10 | ||
| ensure $MAX_RETRIES "check_changefeed_state $PD_ADDR test failed ErrSnapshotLostByGC ''" | ||
| } | ||
|
|
||
| trap 'stop_test $WORK_DIR' EXIT | ||
| run $* | ||
| check_logs $WORK_DIR | ||
| echo "[$(date)] <<<<<< run test case $TEST_NAME success! >>>>>>" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Replacing
log.Panic with an error return is a critical improvement for the robustness of the system. Panicking can lead to unexpected service interruptions, whereas returning an error allows for graceful handling and recovery. The error message also provides clear context about the issue.