Skip to content

Commit f99307e

Browse files
Fix flaky FeatureFlagRegistrySpec / ProviderHotSwapSpec: guard async PROVIDER_READY race after failed setProvider (#162)
1 parent 9cb9ab3 commit f99307e

1 file changed

Lines changed: 33 additions & 3 deletions

File tree

core/src/main/scala/zio/openfeature/FeatureFlagsLive.scala

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,21 @@ final private[openfeature] class FeatureFlagsLive(
3131
evaluationTimeout: Option[Duration] = None
3232
) extends FeatureFlags {
3333

34+
// Records when a `setProvider` swap last failed. Used by the async PROVIDER_READY bridge to decide whether an
35+
// incoming Ready event is a real recovery signal or a stale event left over from a previous attach that's racing
36+
// against the explicit Error transition set by the failed swap. See `setProvider` and `readyHandler` below.
37+
//
38+
// Initialised to `0L` (epoch). `currentTimeMillis() - 0L` will always be far beyond `FailedSwapGuardMillis`, so the
39+
// guard never trips before the first failed swap. (Using `Long.MinValue` instead would overflow the subtraction and
40+
// wrap negative, causing the guard to trip incorrectly and block legitimate Error → Ready recoveries.)
41+
private val recentSwapFailureAt = new java.util.concurrent.atomic.AtomicLong(0L)
42+
43+
// How long after a failed swap an async PROVIDER_READY event should be ignored as a likely stale signal. Real
44+
// recovery scenarios (provider was in Error for an extended period and genuinely transitions back to Ready)
45+
// happen on a much longer timescale than this; the race window we're closing is the SDK's emitter executor
46+
// dispatching a queued event that pre-dates our explicit Error.
47+
private val FailedSwapGuardMillis: Long = 500L
48+
3449
// Bridge Java SDK provider events to ZIO event system
3550
private[openfeature] def startEventBridge: ZIO[Scope, Nothing, Unit] = {
3651
// Read provider name dynamically so events after a provider swap use the new name
@@ -67,7 +82,19 @@ final private[openfeature] class FeatureFlagsLive(
6782
val readyHandler: java.util.function.Consumer[EventDetails] = details => {
6883
val em = extractEventMetadata(details)
6984
runHandler(runtime, "PROVIDER_READY")(
70-
state.statusRef.set(ProviderStatus.Ready) *>
85+
// Transition statusRef only from states where PROVIDER_READY is meaningful. The `Error => Ready` arrow is
86+
// valid per the OpenFeature spec (recovery from a recoverable error) but is guarded by
87+
// `FailedSwapGuardMillis`: if the most recent statusRef write was a failed-swap Error within that window,
88+
// a Ready event arriving now is almost certainly a stale signal queued on the SDK's emitter executor before
89+
// the swap, not a genuine recovery. Real recoveries happen on timescales much longer than the guard.
90+
state.statusRef.update {
91+
case ProviderStatus.NotReady => ProviderStatus.Ready
92+
case ProviderStatus.Stale => ProviderStatus.Ready
93+
case ProviderStatus.Error =>
94+
val sinceFailure = java.lang.System.currentTimeMillis() - recentSwapFailureAt.get()
95+
if (sinceFailure >= FailedSwapGuardMillis) ProviderStatus.Ready else ProviderStatus.Error
96+
case other => other
97+
} *>
7198
state.eventHub.publish(ProviderEvent.Ready(currentMetadata(runtime), em)).unit
7299
)
73100
onReady.foreach(_.countDown())
@@ -816,8 +843,11 @@ final private[openfeature] class FeatureFlagsLive(
816843
case None => ZIO.attemptBlocking(api.setProviderAndWait(newProvider))
817844
}).mapError(e => FeatureFlagError.ProviderInitializationFailed(e))
818845
.tapError(_ =>
819-
// Rollback refs and set Error status so the instance is in a diagnosable state
820-
providerRef.set(oldProvider) *>
846+
// Rollback refs and set Error status so the instance is in a diagnosable state. Stamp
847+
// `recentSwapFailureAt` BEFORE the statusRef write so the async PROVIDER_READY handler sees a fresh
848+
// failure timestamp and skips overwriting Error within the guard window.
849+
ZIO.succeed(recentSwapFailureAt.set(java.lang.System.currentTimeMillis())) *>
850+
providerRef.set(oldProvider) *>
821851
providerNameRef.set(oldName) *>
822852
state.statusRef.set(ProviderStatus.Error)
823853
)

0 commit comments

Comments
 (0)