-
Notifications
You must be signed in to change notification settings - Fork 49
maintainer: avoid panic when maintainer bootstrap #4518
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
e18394d
aad433e
5c81cb5
53086f8
0086672
3a2ab29
f7ea48b
979c5de
473b746
6c07d9d
c0c382f
b6294e5
e5b53cc
c8549fd
a85b80a
eb2a4c9
6e89a30
6664e42
6b9a4e5
ecbe335
8008728
113f22f
e4f6c78
b9e3379
975bb8e
20c27fe
f777938
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -88,7 +88,10 @@ func (c *Controller) FinishBootstrap( | |||||||||||||||||||||||||||
| zap.Int("nodeCount", len(allNodesResp))) | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| // Step 1: Determine start timestamp and update DDL dispatcher | ||||||||||||||||||||||||||||
| startTs, redoStartTs := c.determineStartTs(allNodesResp) | ||||||||||||||||||||||||||||
| startTs, redoStartTs, err := c.determineStartTs(allNodesResp) | ||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||
| return nil, errors.Trace(err) | ||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
|
Comment on lines
+91
to
+96
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove panic on bootstrap start-ts resolution failure Line 93 still panics, so the maintainer can crash in the exact failure path this PR is trying to harden. Return the error instead of panicking, and propagate it without re-wrapping at Line 95. ✅ Suggested fix startTs, redoStartTs, err := c.determineStartTs(allNodesResp)
if err != nil {
- log.Panic("cant not found the startTs from the bootstrap response",
- zap.String("changefeed", c.changefeedID.Name()))
- return nil, errors.Trace(err)
+ log.Error("cannot determine start ts from bootstrap response",
+ zap.Stringer("changefeed", c.changefeedID),
+ zap.Error(err))
+ return nil, err
}📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| // Step 2: Load tables from schema store | ||||||||||||||||||||||||||||
| tables, err := c.loadTables(startTs) | ||||||||||||||||||||||||||||
|
|
@@ -144,7 +147,7 @@ func (c *Controller) FinishBootstrap( | |||||||||||||||||||||||||||
| }, nil | ||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| func (c *Controller) determineStartTs(allNodesResp map[node.ID]*heartbeatpb.MaintainerBootstrapResponse) (uint64, uint64) { | ||||||||||||||||||||||||||||
| func (c *Controller) determineStartTs(allNodesResp map[node.ID]*heartbeatpb.MaintainerBootstrapResponse) (uint64, uint64, error) { | ||||||||||||||||||||||||||||
|
wk989898 marked this conversation as resolved.
|
||||||||||||||||||||||||||||
| var ( | ||||||||||||||||||||||||||||
| startTs uint64 | ||||||||||||||||||||||||||||
| redoStartTs uint64 | ||||||||||||||||||||||||||||
|
|
@@ -170,14 +173,18 @@ func (c *Controller) determineStartTs(allNodesResp map[node.ID]*heartbeatpb.Main | |||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
| if startTs == 0 { | ||||||||||||||||||||||||||||
| log.Panic("cant not found the startTs from the bootstrap response", | ||||||||||||||||||||||||||||
| zap.String("changefeed", c.changefeedID.Name())) | ||||||||||||||||||||||||||||
| return 0, 0, errors.WrapError( | ||||||||||||||||||||||||||||
| errors.ErrChangefeedInitTableTriggerDispatcherFailed, | ||||||||||||||||||||||||||||
| errors.New("all bootstrap responses reported empty checkpointTs"), | ||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||
|
Comment on lines
177
to
+181
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
| if c.enableRedo && redoStartTs == 0 { | ||||||||||||||||||||||||||||
| log.Panic("cant not found the redoStartTs from the bootstrap response", | ||||||||||||||||||||||||||||
| zap.String("changefeed", c.changefeedID.Name())) | ||||||||||||||||||||||||||||
| return 0, 0, errors.WrapError( | ||||||||||||||||||||||||||||
| errors.ErrChangefeedInitTableTriggerDispatcherFailed, | ||||||||||||||||||||||||||||
| errors.New("all bootstrap responses reported empty redoCheckpointTs"), | ||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||
|
wk989898 marked this conversation as resolved.
|
||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
| return startTs, redoStartTs | ||||||||||||||||||||||||||||
| return startTs, redoStartTs, nil | ||||||||||||||||||||||||||||
|
wk989898 marked this conversation as resolved.
|
||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| func (c *Controller) buildWorkingTaskMap( | ||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,7 @@ import ( | |
| appcontext "github.com/pingcap/ticdc/pkg/common/context" | ||
| commonEvent "github.com/pingcap/ticdc/pkg/common/event" | ||
| "github.com/pingcap/ticdc/pkg/config" | ||
| cerrors "github.com/pingcap/ticdc/pkg/errors" | ||
| "github.com/pingcap/ticdc/pkg/eventservice" | ||
| "github.com/pingcap/ticdc/pkg/node" | ||
| "github.com/pingcap/ticdc/pkg/pdutil" | ||
|
|
@@ -1434,6 +1435,38 @@ func TestFinishBootstrap(t *testing.T) { | |
| require.Nil(t, postBootstrapRequest) | ||
| } | ||
|
|
||
| func TestFinishBootstrapReturnsErrorWhenCheckpointMissing(t *testing.T) { | ||
| testutil.SetUpTestServices() | ||
| nodeManager := appcontext.GetService[*watcher.NodeManager](watcher.NodeManagerName) | ||
| nodeManager.GetAliveNodes()["node1"] = &node.Info{ID: "node1"} | ||
|
|
||
| tableTriggerEventDispatcherID := common.NewDispatcherID() | ||
| cfID := common.NewChangeFeedIDWithName("test", common.DefaultKeyspaceName) | ||
| ddlSpan := replica.NewWorkingSpanReplication(cfID, tableTriggerEventDispatcherID, | ||
| common.DDLSpanSchemaID, | ||
| common.KeyspaceDDLSpan(common.DefaultKeyspaceID), &heartbeatpb.TableSpanStatus{ | ||
| ID: tableTriggerEventDispatcherID.ToPB(), | ||
| ComponentStatus: heartbeatpb.ComponentState_Working, | ||
| CheckpointTs: 1, | ||
| }, "node1", false) | ||
| refresher := replica.NewRegionCountRefresher(cfID, time.Minute) | ||
| controller := NewController(cfID, 1, &mockThreadPool{}, | ||
| config.GetDefaultReplicaConfig(), ddlSpan, nil, 1000, 0, refresher, common.DefaultKeyspace, false) | ||
|
|
||
| postBootstrapRequest, err := controller.FinishBootstrap(map[node.ID]*heartbeatpb.MaintainerBootstrapResponse{ | ||
| "node1": { | ||
| ChangefeedID: cfID.ToPB(), | ||
| }, | ||
| }, false) | ||
| require.Nil(t, postBootstrapRequest) | ||
| require.Error(t, err) | ||
| code, ok := cerrors.RFCCode(err) | ||
| require.True(t, ok) | ||
| require.Equal(t, cerrors.ErrChangefeedInitTableTriggerDispatcherFailed.RFCCode(), code) | ||
| require.Contains(t, err.Error(), "all bootstrap responses reported empty checkpointTs") | ||
| require.False(t, controller.bootstrapped) | ||
|
Comment on lines
+1438
to
+1467
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This new test case |
||
| } | ||
|
|
||
| // TestFinishBootstrapSkipsStaleCreateOperatorForDroppedTable covers stale bootstrap Create requests | ||
| // for dropped tables across add/move/split operator types. Each subtest boots from an empty schema | ||
| // snapshot and verifies bootstrap skips the stale create phase instead of recreating ghost tasks or | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| #!/bin/bash | ||
|
|
||
| set -eu | ||
|
|
||
| # This integration test covers a bootstrap retry sequence after the first bootstrap | ||
| # round has already failed and consumed the cached bootstrap responses. | ||
| # | ||
| # Steps: | ||
| # 1. Start a single TiCDC node. | ||
| # 2. Enable a one-shot failpoint so the first maintainer bootstrap fails while | ||
| # loading tables from schema store. | ||
| # 3. Create a blackhole changefeed and wait until the bootstrap failure is logged. | ||
| # 4. Start a second TiCDC node to trigger node change / bootstrap retry. | ||
| # 5. Verify the retry reaches the "empty checkpointTs" error path instead of panicking. | ||
|
|
||
| CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) | ||
| source $CUR/../_utils/test_prepare | ||
| WORK_DIR=$OUT_DIR/$TEST_NAME | ||
| CDC_BINARY=cdc.test | ||
| MAX_RETRIES=20 | ||
|
|
||
| PD_ADDR="http://${UP_PD_HOST_1}:${UP_PD_PORT_1}" | ||
| CHANGEFEED_ID="bootstrap-retry-after-error" | ||
| FAILPOINT_NAME="github.com/pingcap/ticdc/logservice/schemastore/getAllPhysicalTablesGCFastFail" | ||
|
|
||
| function check_changefeed_failed() { | ||
| local pd_addr=$1 | ||
| local changefeed_id=$2 | ||
| info=$(cdc_cli_changefeed query --pd=$pd_addr -c "$changefeed_id" -s | grep -v "Command to ticdc") | ||
| state=$(echo "$info" | jq -r '.state') | ||
| if [[ "$state" != "failed" ]]; then | ||
| echo "changefeed state $state does not equal to failed" | ||
| exit 1 | ||
| fi | ||
| } | ||
|
|
||
| function check_cdc_logs_not_contains() { | ||
| local work_dir=$1 | ||
| local pattern=$2 | ||
| if grep -Eqs "$pattern" "$work_dir"/cdc*.log; then | ||
| echo "TEST FAILED: CDC logs contain unexpected pattern '$pattern'" | ||
| grep -Ens "$pattern" "$work_dir"/cdc*.log || true | ||
| exit 1 | ||
| fi | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
Outdated
|
||
|
|
||
| export -f check_changefeed_failed | ||
| export -f check_cdc_logs_not_contains | ||
|
|
||
| function run() { | ||
| rm -rf $WORK_DIR && mkdir -p $WORK_DIR | ||
|
|
||
| start_tidb_cluster --workdir $WORK_DIR | ||
|
|
||
| run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "0" --addr "127.0.0.1:8300" --pd "$PD_ADDR" | ||
| enable_failpoint --addr "127.0.0.1:8300" --name "$FAILPOINT_NAME" --expr "1*return(true)" | ||
|
|
||
| cdc_cli_changefeed create --pd="$PD_ADDR" --sink-uri="blackhole://" -c "$CHANGEFEED_ID" | ||
|
|
||
| ensure $MAX_RETRIES "check_logs_contains $WORK_DIR 'ErrSnapshotLostByGC' '0'" | ||
|
|
||
| run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "1" --addr "127.0.0.1:8301" --pd "$PD_ADDR" | ||
|
|
||
| ensure $MAX_RETRIES "check_logs_contains $WORK_DIR 'maintainer node changed' '0'" | ||
| ensure $MAX_RETRIES "check_logs_contains $WORK_DIR 'all bootstrap responses reported empty checkpointTs' '0'" | ||
| ensure $MAX_RETRIES "get_cdc_pid 127.0.0.1 8300 >/dev/null" | ||
| ensure $MAX_RETRIES "get_cdc_pid 127.0.0.1 8301 >/dev/null" | ||
| ensure $MAX_RETRIES "check_changefeed_failed $PD_ADDR $CHANGEFEED_ID" | ||
|
|
||
| check_cdc_logs_not_contains "$WORK_DIR" "cant not found the startTs from the bootstrap response|\\[PANIC\\]" | ||
|
|
||
| cleanup_process $CDC_BINARY | ||
| stop_tidb_cluster | ||
| } | ||
|
|
||
| trap 'stop_test $WORK_DIR' EXIT | ||
| run | ||
| check_logs $WORK_DIR | ||
| echo "[$(date)] <<<<<< run test case $TEST_NAME success! >>>>>>" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Replacing
log.Panicwith an error return is a critical improvement for the robustness of the system. Panicking can lead to unexpected service interruptions, whereas returning an error allows for graceful handling and recovery. The error message also provides clear context about the issue.