@@ -221,6 +221,8 @@ import Ouroboros.Consensus.MiniProtocol.ChainSync.Client.State
221
221
DisengagedInitState (.. ), DynamoInitState (.. ),
222
222
JumpInfo (.. ), JumperInitState (.. ),
223
223
ObjectorInitState (.. ))
224
+ import Ouroboros.Consensus.Node.GsmState (GsmState )
225
+ import qualified Ouroboros.Consensus.Node.GsmState as GSM
224
226
import Ouroboros.Consensus.Util
225
227
import Ouroboros.Consensus.Util.IOLike hiding (handle )
226
228
import qualified Ouroboros.Network.AnchoredFragment as AF
@@ -776,28 +778,95 @@ newJumper jumpInfo jumperState = do
776
778
-- | Register a new ChainSync client to a context, returning a 'PeerContext' for
777
779
-- that peer. If there is no dynamo, the peer starts as dynamo; otherwise, it
778
780
-- starts as a jumper. If the GSM is in 'GSM.CaughtUp', the peer instead starts disengaged from CSJ.
781
+ --
782
+ -- @Note [Updating the CSJ State when the GSM State Changes]@:
783
+ --
784
+ -- The 'GsmState' argument to this function is the only way that the state of
785
+ -- the GSM influences CSJ. In particular, when the GSM state changes, the CSJ
786
+ -- state does not need any updates whatsoever. That is remarkable enough to
787
+ -- deserve some explanation.
788
+ --
789
+ -- - The 'GsmState' argument to this function merely causes a new client to be
790
+ -- immediately disengaged if the GSM is currently in 'GSM.CaughtUp'.
791
+ -- Otherwise, CSJ will initialize that peer as a Jumper instead of running
792
+ -- full ChainSync (unless they happen to be immediately promoted to Dynamo,
793
+ -- eg they're the first upstream peer).
794
+ --
795
+ -- - The transition into 'GSM.CaughtUp' does not raise any design questions.
796
+ -- The GSM only makes that transition when all peers are idle, and an idle
797
+ -- peer will have already disengaged from CSJ. So CSJ doesn't need to react
798
+ -- to this transition.
799
+ --
800
+ -- - The GSM only transitions out of 'GSM.CaughtUp' if the tip of its selection
801
+ -- is much older than expected (eg 20 minutes). There are many possible
802
+ -- explanations for why that could have happened, so it's not obvious what is
803
+ -- the best reaction to that transition. This is the interesting case.
804
+ --
805
+ -- The relevant high-level assumption is that in the moment the GSM exits the
806
+ -- 'GSM.CaughtUp' state, either (i) the node has no proper upstream peers or
807
+ -- (ii) the node's selection is out-of-date but not by a huge amount.
808
+ --
809
+ -- - If the node has no peers, then the CSJ state doesn't need any updates: all
810
+ -- of its state is peer-specific. This is anticipated as the main reason the
811
+ -- GSM will leave 'GSM.CaughtUp': eg when the node process was asleep because
812
+ -- the user closed the laptop lid overnight.
813
+ --
814
+ -- - If the node still has peers, then note that they are already disengaged
815
+ -- from CSJ, since the GSM was in 'GSM.CaughtUp'. The only reason to
816
+ -- re-engage them would be to prevent unnecessary load on them. The key
817
+ -- design decision here is that the potential load the node's current peers
818
+ -- might be able to avoid if they re-engage CSJ is not worth the extra
819
+ -- complexity in CSJ. It's only ~20min worth of ChainSync headers. And if the
820
+ -- node hadn't been, eg, asleep for the last ~20min, those peers would have all sent
821
+ -- those headers anyway---the only difference is that the load arrives in a
822
+ -- burst.
823
+ --
824
+ -- One key remark: the transition out of 'GSM.CaughtUp' does (elsewhere)
825
+ -- re-enable the LoP, the LoE, and the GDD, and they apply to all peers
826
+ -- regardless of whether those peers are disengaged from CSJ. So security is
827
+ -- not directly relevant to this question---recall that CSJ is merely an
828
+ -- optimization to avoid excess load on honest upstream peers.
779
829
registerClient ::
780
830
( LedgerSupportsProtocol blk ,
781
831
IOLike m
782
832
) =>
833
+ GsmState ->
834
+ -- ^ the GSM state as of when the node connected to the upstream peer
783
835
Context m peer blk ->
784
836
peer ->
785
837
StrictTVar m (ChainSyncState blk ) ->
786
838
-- | A function to make a client handle from a jumping state.
787
839
(StrictTVar m (ChainSyncJumpingState m blk ) -> ChainSyncClientHandle m blk ) ->
788
840
STM m (PeerContext m peer blk , Maybe (TraceEventCsj peer blk ))
789
- registerClient context peer csState mkHandle = do
790
- (csjState, mbEv) <- getDynamo (handlesCol context) >>= \ case
841
+ registerClient gsmState context peer csState mkHandle = do
842
+ (csjState, mbEv) <- case gsmState of
843
+ GSM. CaughtUp -> pure (Disengaged DisengagedDone , Nothing )
844
+ -- This branch disables CSJ while the GSM is in the CaughtUp state.
845
+ GSM. PreSyncing -> engageClient context csState
846
+ GSM. Syncing -> engageClient context csState
847
+ cschJumping <- newTVar csjState
848
+ let handle = mkHandle cschJumping
849
+ cschcAddHandle (handlesCol context) peer handle
850
+ pure (context {peer, handle}, mbEv)
851
+
852
+ -- | A helper for 'registerClient'
853
+ --
854
+ -- /NOT EXPORTED/
855
+ engageClient ::
856
+ ( LedgerSupportsProtocol blk ,
857
+ IOLike m
858
+ ) =>
859
+ Context m peer blk ->
860
+ StrictTVar m (ChainSyncState blk ) ->
861
+ STM m (ChainSyncJumpingState m blk , Maybe (TraceEventCsj peer blk ))
862
+ engageClient context csState = do
863
+ getDynamo (handlesCol context) >>= \ case
791
864
Nothing -> do
792
865
fragment <- csCandidate <$> readTVar csState
793
866
pure (Dynamo DynamoStarted $ pointSlot $ AF. anchorPoint fragment, Just InitializedAsDynamo )
794
867
Just (_, handle) -> do
795
868
mJustInfo <- readTVar (cschJumpInfo handle)
796
869
(\ x -> (x, Nothing )) <$> newJumper mJustInfo (Happy FreshJumper Nothing )
797
- cschJumping <- newTVar csjState
798
- let handle = mkHandle cschJumping
799
- cschcAddHandle (handlesCol context) peer handle
800
- pure (context {peer, handle}, mbEv)
801
870
802
871
-- | Unregister a client from a 'PeerContext'; this might trigger the election
803
872
-- of a new dynamo or objector if the peer was one of these two.
0 commit comments