@@ -23,6 +23,7 @@ import (
2323 "github.com/pingcap/ticdc/heartbeatpb"
2424 "github.com/pingcap/ticdc/pkg/common"
2525 "github.com/pingcap/ticdc/pkg/config"
26+ "github.com/pingcap/ticdc/pkg/errors"
2627 "github.com/pingcap/ticdc/pkg/liveness"
2728 "github.com/pingcap/ticdc/pkg/messaging"
2829 "github.com/pingcap/ticdc/pkg/node"
@@ -71,8 +72,7 @@ func (m *Manager) onAddMaintainerRequest(req *heartbeatpb.AddMaintainerRequest)
7172 return nil
7273 }
7374
74- target , epoch := m .getDispatcherDrainTarget ()
75- return m .maintainers .handleAddMaintainer (req , target , epoch )
75+ return m .maintainers .handleAddMaintainer (req , m .getDispatcherDrainTarget )
7676}
7777
7878// onRemoveMaintainerRequest delegates changefeed removal to the maintainer part.
@@ -86,10 +86,20 @@ func (m *Manager) onDispatchMaintainerRequest(
8686 msg * messaging.TargetMessage ,
8787) * heartbeatpb.MaintainerStatus {
8888 if m .coordinatorID != msg .From {
89- log .Warn ("ignore invalid coordinator id" ,
90- zap .Any ("request" , msg ),
91- zap .Any ("coordinatorID" , m .coordinatorID ),
92- zap .Stringer ("from" , msg .From ))
89+ fields := []zap.Field {
90+ zap .String ("type" , msg .Type .String ()),
91+ zap .Stringer ("coordinatorID" , m .coordinatorID ),
92+ zap .Stringer ("from" , msg .From ),
93+ }
94+ switch msg .Type {
95+ case messaging .TypeAddMaintainerRequest :
96+ changefeedID := common .NewChangefeedIDFromPB (msg .Message [0 ].(* heartbeatpb.AddMaintainerRequest ).Id )
97+ fields = append (fields , zap .Stringer ("changefeedID" , changefeedID ))
98+ case messaging .TypeRemoveMaintainerRequest :
99+ changefeedID := common .NewChangefeedIDFromPB (msg .Message [0 ].(* heartbeatpb.RemoveMaintainerRequest ).Id )
100+ fields = append (fields , zap .Stringer ("changefeedID" , changefeedID ))
101+ }
102+ log .Warn ("ignore invalid coordinator id" , fields ... )
93103 return nil
94104 }
95105 switch msg .Type {
@@ -155,33 +165,42 @@ func (p *managerMaintainerSet) buildBootstrapResponse() *heartbeatpb.Coordinator
155165// with the latest node-scoped dispatcher drain target.
156166func (p * managerMaintainerSet ) handleAddMaintainer (
157167 req * heartbeatpb.AddMaintainerRequest ,
158- target node.ID ,
159- epoch uint64 ,
168+ getDrainTarget func () (node.ID , uint64 ),
160169) * heartbeatpb.MaintainerStatus {
// Walk-through of this hunk ("-" lines are the removed code, "+" lines the replacement).
// Purpose: build a Maintainer for the changefeed identified by req.Id, register it in
// p.registry, seed it with the drain target obtained from getDrainTarget, and queue its
// EventInit. Every visible path returns nil.
161170 changefeedID := common .NewChangefeedIDFromPB (req .Id )
// Fast path: a changefeed that already has a registry entry is left untouched.
162- _ , ok := p .registry .Load (changefeedID )
163- if ok {
171+ if _ , ok := p .registry .Load (changefeedID ); ok {
164172 return nil
165173 }
166174
167175 info := & config.ChangeFeedInfo {}
// req.Config is decoded as JSON into info. The removed code panicked on a decode
// failure; the replacement logs the error (with the payload length, not the payload)
// and drops the request.
168- err := json .Unmarshal (req .Config , info )
169- if err != nil {
170- log .Panic ("decode changefeed fail" , zap .Error (err ))
176+ if err := json .Unmarshal (req .Config , info ); err != nil {
177+ log .Error ("ignore add maintainer request with invalid config" ,
178+ zap .Stringer ("changefeedID" , changefeedID ),
179+ zap .Int ("configBytes" , len (req .Config )),
180+ zap .Error (err ))
181+ return nil
171182 }
// A zero CheckpointTs is rejected the same way: previously a panic, now an error log
// followed by an early return. The new log no longer attaches the decoded info.
172183 if req .CheckpointTs == 0 {
173- log .Panic ( " add maintainer with invalid checkpointTs" ,
184+ log .Error ( "ignore add maintainer request with invalid checkpointTs" ,
174185 zap .Stringer ("changefeedID" , changefeedID ),
175- zap .Uint64 ("checkpointTs" , req .CheckpointTs ),
176- zap . Any ( "info" , info ))
186+ zap .Uint64 ("checkpointTs" , req .CheckpointTs ))
187+ return nil
177188 }
178-
// The Maintainer is constructed before it is registered, so the losing side of a
// duplicate add has an instance that must be closed (handled below).
179189 maintainer := NewMaintainer (changefeedID , p .conf , info , p .nodeInfo , p .taskScheduler , req .CheckpointTs , req .IsNewChangefeed , req .KeyspaceId )
// Removed ordering: seed the drain target, Store into the registry, then push EventInit.
180- // Seed the maintainer with the manager-level drain snapshot before its event
181- // loop starts so late additions still honor an already-active drain target.
182- maintainer .SetDispatcherDrainTarget (target , epoch )
183- p .registry .Store (changefeedID , maintainer )
184- maintainer .pushEvent (& Event {changefeedID : changefeedID , eventType : EventInit })
// New ordering: LoadOrStore replaces the separate Store, so a request that passed the
// Load check above but finds an entry already present is detected via `loaded`.
190+ registered , loaded := p .registry .LoadOrStore (changefeedID , maintainer )
191+ if loaded {
192+ // Duplicate add requests can race on the same changefeed. Drop the loser and
193+ // stop the redundant maintainer immediately so background goroutines do not leak.
194+ maintainer .Close ()
195+ return nil
196+ }
197+
// NOTE(review): when loaded is false, registered is presumably the very value just
// stored (sync.Map-style LoadOrStore semantics — confirm the registry type), which
// would make this assertion equivalent to using `maintainer` directly.
198+ registeredMaintainer := registered .(* Maintainer )
199+ // Register the maintainer before seeding the drain snapshot so concurrent
200+ // manager-level drain fanout can always observe it in the registry.
// getDrainTarget is invoked only here, after registration, so the (target, epoch) pair
// is read at this point instead of being captured by the caller before the call.
201+ target , epoch := getDrainTarget ()
202+ registeredMaintainer .SetDispatcherDrainTarget (target , epoch )
203+ registeredMaintainer .pushEvent (& Event {changefeedID : changefeedID , eventType : EventInit })
185204 return nil
186205}
187206
@@ -276,7 +295,7 @@ func (p *managerMaintainerSet) dispatchMaintainerMessage(
276295 }
277296 select {
278297 case <- ctx .Done ():
279- return ctx .Err ()
298+ return errors . Trace ( ctx .Err () )
280299 default :
281300 maintainer := c .(* Maintainer )
282301 maintainer .pushEvent (& Event {
0 commit comments