staging-v24.3.13: admission: incorporate higher compaction rate during WAL failover #146596

Merged 1 commit on May 13, 2025

44 changes: 26 additions & 18 deletions pkg/util/admission/io_load_listener.go
@@ -762,37 +762,45 @@ func (io *ioLoadListener) adjustTokensInner(
// primary WAL location, which is also the location to which the store
// flushes and compacts, may be unhealthy. If it is unhealthy, flushes and
// compactions can stall, which can result in artificially low token counts
-// for flushes and compactions, which can unnecessarily throttle work.
+// for flushes and compactions, which can unnecessarily throttle work. It is
+// also possible that the primary WAL location was transiently observed to
+// be slow, and flushes and compactions are mostly unaffected, and may even
+// be increasing in their rate, during WAL failover, if the workload is
+// increasing its write rate.
//
// We make the assumption that failover will be very aggressive compared to
// the interval at which this token computation is happening (15s). An
-// UnhealthyOperationLatencyThreshold of 1s or lower means that an interval
-// in which intWALFailover was false could at worst have had its last 1s
-// have stalled flushes/compactions. So the throughput observed here will be
-// 93.3% of what would have been possible with a healthy primary, which is
-// considered acceptable.
+// UnhealthyOperationLatencyThreshold of 100ms (the default) means that an
+// interval in which intWALFailover was false could at worst have had its
+// last 100ms have stalled flushes/compactions. So the throughput observed
+// here will be 99.3% of what would have been possible with a healthy
+// primary, which is considered acceptable.
//
// We also make the assumption that failback will be reasonably aggressive
-// once the primary is considered healthy, say within 10s. So a disk stall
-// in the primary that lasts 30s, will cause WAL failover for ~40s, and a
-// disk stall for 1s will cause failover for ~11s. The latter (11s) is short
-// enough that we could potentially allow unlimited tokens during failover.
-// The concern is the former case, where unlimited tokens could result in
-// excessive admission into L0. So the default behavior when intWALFailover
-// is true is to (a) continue using the compaction tokens from before the
-// failover, (b) not constrain flush tokens, (c) constrain elastic traffic
+// once the primary is considered healthy (HealthyInterval uses the default
+// of 15s). So a disk stall in the primary that lasts 30s, will cause WAL
+// failover for ~45s, and a disk stall for 1s will cause failover for ~16s.
+// The latter (16s) is short enough that we could potentially allow
+// unlimited tokens during failover. The concern is the former case, where
+// unlimited tokens could result in excessive admission into L0. So the
+// default behavior when intWALFailover is true is to (a) continue using the
+// compaction tokens from before the failover, unless the compaction rate is
+// increasing (b) not constrain flush tokens, (c) constrain elastic traffic
// to effectively 0 tokens. We allow this behavior to be overridden to have
// unlimited tokens.
intWALFailover := cumWALSecondaryWriteDuration-io.cumWALSecondaryWriteDuration > 0
var smoothedIntL0CompactedBytes int64
-if intWALFailover {
-// Reuse previous smoothed value.
+var updatedSmoothedIntL0CompactedBytes bool
+if intWALFailover && intL0CompactedBytes < prev.smoothedIntL0CompactedBytes {
+// Reuse previous smoothed value since the decrease in compaction bytes
+// could be due to an unhealthy primary WAL location.
smoothedIntL0CompactedBytes = prev.smoothedIntL0CompactedBytes
} else {
// Compaction scheduling can be uneven in prioritizing L0 for compactions,
// so smooth out what is being removed by compactions.
smoothedIntL0CompactedBytes = int64(alpha*float64(intL0CompactedBytes) +
(1-alpha)*float64(prev.smoothedIntL0CompactedBytes))
+updatedSmoothedIntL0CompactedBytes = true
}

// Flush tokens:
@@ -1007,7 +1015,7 @@ func (io *ioLoadListener) adjustTokensInner(
// Overload: Score is >= 2. We limit compaction tokens, and limit tokens to
// at most C/2 tokens.
if score < 0.5 {
-if intWALFailover {
+if !updatedSmoothedIntL0CompactedBytes {
smoothedCompactionByteTokens = prev.smoothedCompactionByteTokens
} else {
// Underload. Maintain a smoothedCompactionByteTokens based on what was
@@ -1023,7 +1031,7 @@ func (io *ioLoadListener) adjustTokensInner(
totalNumByteTokens = unlimitedTokens
} else {
doLogFlush = true
-if intWALFailover {
+if !updatedSmoothedIntL0CompactedBytes {
smoothedCompactionByteTokens = prev.smoothedCompactionByteTokens
} else {
var fTotalNumByteTokens float64
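
The following is a minimal, self-contained sketch of the behavior this hunk introduces, not code from the PR: the function name smoothCompactedBytes and the standalone harness are illustrative, and alpha = 0.5 matches the smoothing constant used by io_load_listener.go. It shows how a drop in interval compaction bytes is ignored during WAL failover while an increase is still folded into the smoothed value.

```go
package main

import "fmt"

// alpha matches the smoothing constant used by io_load_listener.go.
const alpha = 0.5

// smoothCompactedBytes sketches the changed logic: during WAL failover the
// previous smoothed value is reused only when the interval compaction bytes
// decreased, since a decrease may be an artifact of an unhealthy primary WAL
// location. A higher compaction rate is always incorporated.
func smoothCompactedBytes(intWALFailover bool, intL0CompactedBytes, prevSmoothed int64) (smoothed int64, updated bool) {
	if intWALFailover && intL0CompactedBytes < prevSmoothed {
		// Possible stall artifact: keep the pre-failover smoothed value.
		return prevSmoothed, false
	}
	// Exponentially smooth the observed compaction throughput.
	return int64(alpha*float64(intL0CompactedBytes) + (1-alpha)*float64(prevSmoothed)), true
}

func main() {
	// During failover, a drop in compacted bytes is not folded in...
	fmt.Println(smoothCompactedBytes(true, 50_000, 100_000)) // 100000 false
	// ...but a rise is, which is what lets more regular tokens be granted.
	fmt.Println(smoothCompactedBytes(true, 700_000, 100_000)) // 400000 true
}
```

When updated is false, the token computation keeps prev.smoothedCompactionByteTokens as before; the two later hunks switch that check from intWALFailover to !updatedSmoothedIntL0CompactedBytes so an increasing compaction rate also flows into the token calculation.
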
17 changes: 17 additions & 0 deletions pkg/util/admission/testdata/io_load_listener
@@ -426,6 +426,23 @@ store-request-estimates: writeTokens: 10000
tick: 0, setAdmittedDoneModelsLocked: l0-write-lm: 1.37x+1 l0-ingest-lm: 0.75x+1 ingest-lm: 1.00x+1 write-amp-lm: 50.50x+1
setAvailableTokens: io-tokens=417(elastic 1) elastic-disk-bw-tokens=unlimited max-byte-tokens=417(elastic 1) max-disk-bw-tokens=unlimited lastTick=false

+prep-admission-stats admitted=10 write-bytes=200000
+----
+{workCount:10 writeAccountedBytes:200000 ingestedAccountedBytes:0 statsToIgnore:{ingestStats:{Bytes:0 ApproxIngestedIntoL0Bytes:0 MemtableOverlappingFiles:0} writeBytes:0} aboveRaftStats:{workCount:10 writeAccountedBytes:200000 ingestedAccountedBytes:0} aux:{bypassedCount:0 writeBypassedAccountedBytes:0 ingestedBypassedAccountedBytes:0}}
+
+# Compactions remove 700,000 bytes from L0. Even though WAL failover is still
+# active, this higher compaction rate is incorporated into giving out more
+# regular tokens. Specifically, the smoothed interval compacted rate increases
+# to 391 KiB, and the regular tokens increase to 208 KiB.
+set-state l0-bytes=101000 l0-added-write=1001000 l0-files=10 l0-sublevels=10 print-only-first-tick=true wal-secondary-write-sec=1 flush-bytes=1000 flush-work-sec=8 flush-idle-sec=10 write-stall-count=0
+----
+compaction score 0.500 (10 ssts, 10 sub-levels), L0 growth 195 KiB (write 195 KiB (ignored 0 B) ingest 0 B (ignored 0 B)): requests 0 (0 bypassed) with 0 B acc-write (0 B bypassed) + 0 B acc-ingest (0 B bypassed) + 195 KiB adjusted-LSM-writes + 0 B adjusted-disk-writes + write-model 0.00x+0 B (smoothed 1.37x+1 B) + ingested-model 0.00x+0 B (smoothed 0.75x+1 B) + write-amp-model 0.00x+0 B (smoothed 50.50x+1 B) + at-admission-tokens 9.8 KiB, compacted 684 KiB [≈391 KiB], flushed 366 KiB [≈0 B] (mult 1.00); admitting (WAL failover) 208 KiB (rate 14 KiB/s) (elastic 1 B rate 0 B/s) due to L0 growth (used total: 0 B elastic 0 B); write stalls -4
+diskBandwidthLimiter (tokenUtilization 0.00, tokensUsed (elastic 0 B, snapshot 0 B, regular 0 B) tokens (write 0 B (prev 0 B)), writeBW 0 B/s, readBW 0 B/s, provisioned 0 B/s)
+{ioLoadListenerState:{cumL0AddedBytes:1001000 curL0Bytes:101000 cumWriteStallCount:0 cumFlushWriteThroughput:{Bytes:1001000 WorkDuration:20000000000 IdleDuration:420000000000} diskBW:{bytesRead:0 bytesWritten:0} cumCompactionStats:{writeBytes:0 numOutLevelsGauge:1} cumWALSecondaryWriteDuration:4000000000 smoothedIntL0CompactedBytes:400000 smoothedCompactionByteTokens:212500 smoothedNumFlushTokens:0 flushUtilTargetFraction:1 totalNumByteTokens:212500 byteTokensAllocated:0 byteTokensUsed:0 byteTokensUsedByElasticWork:0 totalNumElasticByteTokens:1 elasticByteTokensAllocated:0 elasticDiskWriteTokens:9223372036854775807 elasticDiskWriteTokensAllocated:0} requestEstimates:{writeTokens:10000} l0WriteLM:{multiplier:1.374975 constant:1} l0IngestLM:{multiplier:0.7505 constant:1} ingestLM:{multiplier:1 constant:1} writeAmpLM:{multiplier:50.5 constant:1} aux:{intL0AddedBytes:200000 intL0CompactedBytes:700000 intFlushTokens:375000 intFlushUtilization:0.4444444444444444 intWriteStalls:-4 intWALFailover:true prevTokensUsed:0 prevTokensUsedByElasticWork:0 tokenKind:0 usedCompactionTokensLowerBound:false perWorkTokensAux:{intWorkCount:0 intL0WriteBytes:200000 intL0IngestedBytes:0 intLSMIngestedBytes:0 intL0WriteAccountedBytes:0 intIngestedAccountedBytes:0 intL0WriteLinearModel:{multiplier:0 constant:0} intL0IngestedLinearModel:{multiplier:0 constant:0} intIngestedLinearModel:{multiplier:0 constant:0} intWriteAmpLinearModel:{multiplier:0 constant:0} intBypassedWorkCount:0 intL0WriteBypassedAccountedBytes:0 intIngestedBypassedAccountedBytes:0 intL0IgnoredWriteBytes:0 intL0IgnoredIngestedBytes:0 intAdjustedLSMWrites:200000 intAdjustedDiskWriteBytes:0} doLogFlush:true} ioThreshold:<nil>}
+store-request-estimates: writeTokens: 10000
+tick: 0, setAdmittedDoneModelsLocked: l0-write-lm: 1.37x+1 l0-ingest-lm: 0.75x+1 ingest-lm: 1.00x+1 write-amp-lm: 50.50x+1
+setAvailableTokens: io-tokens=3542(elastic 1) elastic-disk-bw-tokens=unlimited max-byte-tokens=3542(elastic 1) max-disk-bw-tokens=unlimited lastTick=false
+
# Test case with flush tokens.
init
----
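
A back-of-the-envelope check of the numbers the new test asserts, under the assumption that the pre-failover smoothed interval compacted value was 100,000 bytes (implied by the smoothedIntL0CompactedBytes:400000 in the printed state) and the same alpha = 0.5 smoothing as above; the snippet is illustrative, not part of the test harness.

```go
package main

import "fmt"

func main() {
	const alpha = 0.5
	prevSmoothed := 100_000.0 // assumed pre-failover smoothed interval compacted bytes
	intCompacted := 700_000.0 // bytes removed from L0 by compactions in this interval

	smoothed := alpha*intCompacted + (1-alpha)*prevSmoothed
	fmt.Printf("smoothed compacted: %.0f bytes = %.1f KiB\n", smoothed, smoothed/1024)
	// smoothed compacted: 400000 bytes = 390.6 KiB (printed as ≈391 KiB)

	// The regular-token figure corresponds to smoothedCompactionByteTokens:212500
	// in the printed state; its exact derivation uses a token formula not shown
	// in this diff.
	fmt.Printf("regular tokens: %.1f KiB\n", 212_500.0/1024)
	// regular tokens: 207.5 KiB (printed as 208 KiB)
}
```
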