Skip to content

Commit dfa1708

Browse files
authored
ChainSync: let GSM disable and re-enable CSJ; also enable LoP in PreSyncing (#1492)
Fixes #1490. Contributes to #1491.
2 parents e5be6f0 + 4b80165 commit dfa1708

File tree

6 files changed

+414
-11
lines changed

6 files changed

+414
-11
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!--
2+
A new scriv changelog fragment.
3+
4+
Uncomment the section that is right (remove the HTML comment wrapper).
5+
-->
6+
7+
### Patch
8+
9+
- Bugfix in Ouroboros Genesis. Added the @GsmState@ argument to the
10+
@registerClient@ function for the ChainSync Jumping optimization. If the node
11+
is in the @GSM.CaughtUp@ state, new peers now immediately disengage from CSJ.
12+
13+
<!--
14+
### Non-Breaking
15+
16+
- A bullet item for the Non-Breaking category.
17+
18+
-->
19+
20+
<!--
21+
### Breaking
22+
23+
- A bullet item for the Breaking category.
24+
25+
-->

ouroboros-consensus/ouroboros-consensus.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,7 @@ test-suite consensus-test
585585
Test.Consensus.Mempool.StateMachine
586586
Test.Consensus.Mempool.Util
587587
Test.Consensus.MiniProtocol.BlockFetch.Client
588+
Test.Consensus.MiniProtocol.ChainSync.CSJ
588589
Test.Consensus.MiniProtocol.ChainSync.Client
589590
Test.Consensus.MiniProtocol.LocalStateQuery.Server
590591
Test.Consensus.Util.MonadSTM.NormalForm

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/MiniProtocol/ChainSync/Client.hs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -410,8 +410,8 @@ bracketChainSyncClient
410410
, cschJumpInfo
411411
}
412412
insertHandle = atomicallyWithMonotonicTime $ \time -> do
413-
initialGsmState <- getGsmState
414-
updateLopBucketConfig lopBucket initialGsmState time
413+
gsmState <- getGsmState
414+
updateLopBucketConfig lopBucket gsmState time
415415
cschcAddHandle varHandles peer handle
416416
deleteHandle = atomically $ cschcRemoveHandle varHandles peer
417417
bracket_ insertHandle deleteHandle $ f Jumping.noJumping
@@ -424,13 +424,15 @@ bracketChainSyncClient
424424
acquireContext lopBucket cschState (CSJEnabledConfig jumpSize) = do
425425
tid <- myThreadId
426426
atomicallyWithMonotonicTime $ \time -> do
427-
initialGsmState <- getGsmState
428-
updateLopBucketConfig lopBucket initialGsmState time
427+
gsmState <- getGsmState
428+
updateLopBucketConfig lopBucket gsmState time
429429
cschJumpInfo <- newTVar Nothing
430430
context <- Jumping.makeContext varHandles jumpSize tracerCsj
431-
Jumping.registerClient context peer cschState $ \cschJumping -> ChainSyncClientHandle
431+
Jumping.registerClient gsmState context peer cschState $ \cschJumping -> ChainSyncClientHandle
432432
{ cschGDDKill = throwTo tid DensityTooLow
433433
, cschOnGsmStateChanged = updateLopBucketConfig lopBucket
434+
-- See Note [Updating the CSJ State when the GSM State Changes]
435+
-- in the Haddocks of 'Jumping.registerClient'.
434436
, cschState
435437
, cschJumping
436438
, cschJumpInfo

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/MiniProtocol/ChainSync/Client/Jumping.hs

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,8 @@ import Ouroboros.Consensus.MiniProtocol.ChainSync.Client.State
221221
DisengagedInitState (..), DynamoInitState (..),
222222
JumpInfo (..), JumperInitState (..),
223223
ObjectorInitState (..))
224+
import Ouroboros.Consensus.Node.GsmState (GsmState)
225+
import qualified Ouroboros.Consensus.Node.GsmState as GSM
224226
import Ouroboros.Consensus.Util
225227
import Ouroboros.Consensus.Util.IOLike hiding (handle)
226228
import qualified Ouroboros.Network.AnchoredFragment as AF
@@ -776,28 +778,95 @@ newJumper jumpInfo jumperState = do
776778
-- | Register a new ChainSync client to a context, returning a 'PeerContext' for
777779
-- that peer. If there is no dynamo, the peer starts as dynamo; otherwise, it
778780
-- starts as a jumper.
781+
--
782+
-- @Note [Updating the CSJ State when the GSM State Changes]@:
783+
--
784+
-- The 'GsmState' argument to this function is the only way that the state of
785+
-- the GSM influences CSJ. In particular, when the GSM state changes, the CSJ
786+
-- state does not need any updates whatsoever. That is remarkable enough to
787+
-- deserve some explanation.
788+
--
789+
-- - The 'GsmState' argument to this function merely causes a new client to be
790+
-- immediately disengaged if the GSM is currently in 'GSM.CaughtUp'.
791+
-- Otherwise, CSJ will initialize that peer as a Jumper instead of running
792+
-- full ChainSync (unless they happen to be immediately promoted to Dynamo,
793+
-- eg they're the first upstream peer).
794+
--
795+
-- - The transition into 'GSM.CaughtUp' does not raise any design questions.
796+
-- The GSM only makes that transition when all peers are idle, and an idle
797+
-- peer will have already disengaged from CSJ. So CSJ doesn't need to react
798+
-- to this transition.
799+
--
800+
-- - The GSM only transitions out of 'GSM.CaughtUp' if the tip of its selection
801+
-- is much older than expected (eg 20 minutes). There are many possible
802+
-- explanations for why that could have happened, so it's not obvious what is
803+
-- the best reaction to that transition. This is the interesting case.
804+
--
805+
-- The relevant high-level assumption is that in the moment the GSM exits the
806+
-- 'GSM.CaughtUp' state, either (i) the node has no proper upstream peers or
807+
-- (ii) the node's selection is out-of-date but not by a huge amount.
808+
--
809+
-- - If the node has no peers, then the CSJ state doesn't need any updates: all
810+
-- of its state is peer-specific. This is anticipated as the main reason the
811+
-- CSJ will leave 'GSM.CaughtUp': eg when the node process was asleep because
812+
-- the user closed the laptop lid overnight.
813+
--
814+
-- - If the node still has peers, then note that they are already disengaged
815+
-- from CSJ, since the GSM was in 'GSM.CaughtUp'. The only reason to
816+
-- re-engage them would be to prevent unnecessary load on them. The key
817+
-- design decision here is that the potential load the node's current peers
818+
-- might be able to avoid if they re-engage CSJ from is not worth the extra
819+
-- complexity in CSJ. It's only ~20min worth of ChainSync headers. And if the
820+
-- node hadn't been, eg, asleep last ~20min, those peers would have all sent
821+
-- those headers anyway---the only difference is that the load arrives in a
822+
-- burst.
823+
--
824+
-- One key remark: the transition out of 'GSM.CaughtUp' does (elsewhere)
825+
-- re-enable the LoP, the LoE, and the GDD, and they apply to all peers
826+
-- regardless of whether those peers are disengaged from CSJ. So security is
827+
-- not directly relevant to this question---recall that CSJ is merely an
828+
-- optimization to avoid excess load on honest upstream peers.
779829
registerClient ::
780830
( LedgerSupportsProtocol blk,
781831
IOLike m
782832
) =>
833+
GsmState ->
834+
-- ^ the GSM state as of when the node connected to the upstream peer
783835
Context m peer blk ->
784836
peer ->
785837
StrictTVar m (ChainSyncState blk) ->
786838
-- | A function to make a client handle from a jumping state.
787839
(StrictTVar m (ChainSyncJumpingState m blk) -> ChainSyncClientHandle m blk) ->
788840
STM m (PeerContext m peer blk, Maybe (TraceEventCsj peer blk))
789-
registerClient context peer csState mkHandle = do
790-
(csjState, mbEv) <- getDynamo (handlesCol context) >>= \case
841+
registerClient gsmState context peer csState mkHandle = do
842+
(csjState, mbEv) <- case gsmState of
843+
GSM.CaughtUp -> pure (Disengaged DisengagedDone, Nothing)
844+
-- This branch disables CSJ while the GSM is in the CaughtUp state.
845+
GSM.PreSyncing -> engageClient context csState
846+
GSM.Syncing -> engageClient context csState
847+
cschJumping <- newTVar csjState
848+
let handle = mkHandle cschJumping
849+
cschcAddHandle (handlesCol context) peer handle
850+
pure (context {peer, handle}, mbEv)
851+
852+
-- | A helper for 'registerClient'
853+
--
854+
-- /NOT EXPORTED/
855+
engageClient ::
856+
( LedgerSupportsProtocol blk,
857+
IOLike m
858+
) =>
859+
Context m peer blk ->
860+
StrictTVar m (ChainSyncState blk) ->
861+
STM m (ChainSyncJumpingState m blk, Maybe (TraceEventCsj peer blk))
862+
engageClient context csState = do
863+
getDynamo (handlesCol context) >>= \case
791864
Nothing -> do
792865
fragment <- csCandidate <$> readTVar csState
793866
pure (Dynamo DynamoStarted $ pointSlot $ AF.anchorPoint fragment, Just InitializedAsDynamo)
794867
Just (_, handle) -> do
795868
mJustInfo <- readTVar (cschJumpInfo handle)
796869
(\x -> (x, Nothing)) <$> newJumper mJustInfo (Happy FreshJumper Nothing)
797-
cschJumping <- newTVar csjState
798-
let handle = mkHandle cschJumping
799-
cschcAddHandle (handlesCol context) peer handle
800-
pure (context {peer, handle}, mbEv)
801870

802871
-- | Unregister a client from a 'PeerContext'; this might trigger the election
803872
-- of a new dynamo or objector if the peer was one of these two.

ouroboros-consensus/test/consensus-test/Main.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import qualified Test.Consensus.Mempool.Fairness (tests)
1313
import qualified Test.Consensus.Mempool.StateMachine (tests)
1414
import qualified Test.Consensus.MiniProtocol.BlockFetch.Client (tests)
1515
import qualified Test.Consensus.MiniProtocol.ChainSync.Client (tests)
16+
import qualified Test.Consensus.MiniProtocol.ChainSync.CSJ (tests)
1617
import qualified Test.Consensus.MiniProtocol.LocalStateQuery.Server (tests)
1718
import qualified Test.Consensus.Util.MonadSTM.NormalForm (tests)
1819
import qualified Test.Consensus.Util.Versioned (tests)
@@ -29,6 +30,7 @@ tests =
2930
[ Test.Consensus.BlockchainTime.Simple.tests
3031
, Test.Consensus.HeaderValidation.tests
3132
, Test.Consensus.MiniProtocol.BlockFetch.Client.tests
33+
, Test.Consensus.MiniProtocol.ChainSync.CSJ.tests
3234
, Test.Consensus.MiniProtocol.ChainSync.Client.tests
3335
, Test.Consensus.MiniProtocol.LocalStateQuery.Server.tests
3436
, testGroup "Mempool"

0 commit comments

Comments
 (0)