diff --git a/pkg/util/admission/admissionpb/io_threshold.go b/pkg/util/admission/admissionpb/io_threshold.go
index 7de60b4c4139..05f3983abb45 100644
--- a/pkg/util/admission/admissionpb/io_threshold.go
+++ b/pkg/util/admission/admissionpb/io_threshold.go
@@ -14,21 +14,24 @@ import (
 
 // Score returns, as the second return value, whether IO admission control is
 // considering the Store overloaded wrt compaction of L0. The first return
-// value is a 1-normalized float (i.e. 1.0 is the threshold at which the
-// second value flips to true).
+// value is a 1-normalized float, where 1.0 represents severe overload, and
+// therefore 1.0 is the threshold at which the second value flips to true.
+// Admission control currently tries to maintain a store around a score
+// threshold of 0.5 for regular work and lower than 0.25 for elastic work. NB:
+// this is an incomplete representation of the signals considered by admission
+// control -- admission control additionally considers disk and flush
+// throughput bottlenecks.
 //
 // The zero value returns (0, false). Use of the nil pointer is not allowed.
 //
-// TODO(sumeer): consider whether we need to enhance this to incorporate
-// overloading via flush bandwidth. I suspect we can get away without
-// incorporating flush bandwidth since typically chronic overload will be due
-// to compactions falling behind (though that may change if we increase the
-// max number of compactions). And we will need to incorporate overload due to
-// disk bandwidth bottleneck.
-//
 // NOTE: Future updates to the scoring function should be version gated as the
 // threshold is gossiped and used to determine lease/replica placement via the
 // allocator.
+//
+// IOThreshold has various parameters that can evolve over time. The source of
+// truth for an IOThreshold struct is admission.ioLoadListener, and it is
+// propagated elsewhere using the admission.IOThresholdConsumer interface. No
+// other production code should create one from scratch.
 func (iot *IOThreshold) Score() (float64, bool) {
 	// iot.L0NumFilesThreshold and iot.L0NumSubLevelsThreshold are initialized to
 	// 0 by default, and there appears to be a period of time before we update
diff --git a/pkg/util/admission/io_load_listener.go b/pkg/util/admission/io_load_listener.go
index 761ef3239e1e..f9a53de44433 100644
--- a/pkg/util/admission/io_load_listener.go
+++ b/pkg/util/admission/io_load_listener.go
@@ -119,7 +119,12 @@ var walFailoverUnlimitedTokens = settings.RegisterBoolSetting(
 	"when true, during WAL failover, unlimited admission tokens are allocated",
 	false)
 
-// Experimental observations:
+// The following experimental observations were used to guide the initial
+// implementation, which aimed to maintain a sub-level count of 20 with token
+// calculation every 60s. Since then, the code has evolved to calculate tokens
+// every 15s and to aim for regular work maintaining a sub-level count of
+// l0SubLevelCountOverloadThreshold/2. So this commentary should be
+// interpreted in that context:
 // - Sub-level count of ~40 caused a node heartbeat latency p90, p99 of 2.5s,
 //   4s. With a setting that limits sub-level count to 10, before the system
 //   is considered overloaded, and adjustmentInterval = 60, we see the actual
@@ -133,9 +138,35 @@ var walFailoverUnlimitedTokens = settings.RegisterBoolSetting(
 //   then we run the risk of having 100+ sub-levels when we hit a file count
 //   of 1000. Instead we use a sub-level overload threshold of 20.
 //
-// We've set these overload thresholds in a way that allows the system to
-// absorb short durations (say a few minutes) of heavy write load.
-const l0FileCountOverloadThreshold = 1000
+// A sub-level count of l0SubLevelCountOverloadThreshold results in the same
+// score as a file count of l0FileCountOverloadThreshold. Exceptions: a small
+// L0 in terms of bytes (see IOThreshold.Score); these constants being
+// overridden in the cluster settings
+// admission.l0_sub_level_count_overload_threshold and
+// admission.l0_file_count_overload_threshold. We ignore these exceptions in
+// the discussion here. Hence, 20 sub-levels is equivalent in score to 4000 L0
+// files, i.e., 1 sub-level is equivalent to 200 files.
+//
+// Ideally, equivalence here should match equivalence in how L0 is scored for
+// compactions. CockroachDB sets Pebble's L0CompactionThreshold to a constant
+// value of 2, which results in a compaction score of 1.0 with 1 sub-level.
+// CockroachDB does not override Pebble's L0CompactionFileThreshold, which
+// defaults to 500, so 500 files cause a compaction score of 1.0. So in
+// Pebble's compaction scoring logic, 1 sub-level is equivalent to 500 L0
+// files.
+//
+// So admission control is more sensitive to higher file count than Pebble's
+// compaction scoring. l0FileCountOverloadThreshold used to be 1000 up to
+// v24.3, and increasing it to 4000 was considered a significant enough
+// change -- increasing to 10000, to make Pebble's compaction logic and
+// admission control equivalent, was considered too risky. Note that
+// admission control tries to maintain a score of 0.5 when admitting regular
+// work, which, if caused by file count, represents 2000 files. With 2000
+// files, the L0 compaction score is 2000/500 = 4.0, which is significantly
+// above the compaction threshold of 1.0 (at which a level is eligible for
+// compaction). So one could argue that this inconsistency between admission
+// control and Pebble is potentially harmless.
+const l0FileCountOverloadThreshold = 4000
 const l0SubLevelCountOverloadThreshold = 20
 
 // ioLoadListener adjusts tokens in kvStoreTokenGranter for IO, specifically due to
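The score equivalences spelled out in the new io_load_listener.go comment can be sanity-checked with a small sketch. The snippet below is illustrative only: simplifiedScore is a hypothetical helper, not the actual IOThreshold.Score implementation, which additionally discounts sub-levels when L0 is small in bytes and honors the cluster-setting overrides mentioned above.

package main

import (
	"fmt"
	"math"
)

// simplifiedScore mirrors the equivalence described in the comment: the score
// is 1-normalized against the overload thresholds, so 20 sub-levels and 4000
// L0 files both score 1.0. (Hypothetical sketch; the real IOThreshold.Score
// also adjusts for a small L0 in bytes and for cluster-setting overrides.)
func simplifiedScore(l0Files, l0SubLevels int) float64 {
	const (
		fileThreshold     = 4000.0 // l0FileCountOverloadThreshold
		subLevelThreshold = 20.0   // l0SubLevelCountOverloadThreshold
	)
	return math.Max(
		float64(l0Files)/fileThreshold,
		float64(l0SubLevels)/subLevelThreshold,
	)
}

func main() {
	// Equivalent overload points: both print a score of 1.
	fmt.Println(simplifiedScore(4000, 0), simplifiedScore(0, 20)) // 1 1
	// A regular-work target score of 0.5, if driven purely by file count,
	// corresponds to 2000 files; Pebble's compaction scoring, with
	// L0CompactionFileThreshold = 500, would rate that 2000/500 = 4.0.
	fmt.Println(simplifiedScore(2000, 0)) // 0.5
	fmt.Println(2000.0 / 500.0)           // 4
}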