From 8aefd0e5afa1ddbad7fefe24a8b3a1f52a4f7ba0 Mon Sep 17 00:00:00 2001 From: Chris Anderson Date: Wed, 27 May 2026 09:22:48 -0700 Subject: [PATCH 1/2] fix(stress): per-fixture hang watchdog + WaitForIdle cap raise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two race-driven flakes surfaced in the 500x selftest stress run on main (GitHub Actions run 26513480330): 4/10000 iterations failed, with 2 of 4 in NativeDocking_* fixtures. WaitForIdleAsync silently returned after 10 low-priority dispatcher yields whether or not the render loop had settled. Under CI VM contention this dropped tests onto a half-rendered tree, producing e.g. SplitDocArea_CloseNonLast finding no `body:d1` on iter 3/500. Raise the default cap to 50 and log a Debug.WriteLine when it hits, so the next flake is greppable instead of silent. The host-level HangWatchdogLoop used a global 60 s constant — equal to EventSubscriptionLeakBaseline's own FixtureTimeout. The two raced on the worst-case CI tick (~60 s for 200 renders + 200 reconciles) and the watchdog won, FailFasting the shard instead of letting the fixture's graceful timeout fire first. Make the watchdog threshold per-fixture: max(60 s, FixtureTimeout + 30 s). The fixture's own budget always gets first crack; dispatcher-starvation FailFast only fires after that. Also: bump EventSubscriptionLeakBaseline FixtureTimeout 60 → 120 s (auto-bumps its watchdog to 150 s under the new rule), and add heartbeat H.Check calls every 25 cycles so a future hang reveals which loop window it lived in rather than just "no progress". Co-Authored-By: Claude Opus 4.7 (1M context) --- src/Reactor/Hosting/ReactorHost.cs | 23 +++++++--- .../NativeDockingReliabilityFixture.cs | 15 +++++-- .../SelfTest/SelfTestFixtureBase.cs | 6 ++- .../SelfTest/SelfTestRunner.cs | 43 +++++++++++++------ 4 files changed, 63 insertions(+), 24 deletions(-) diff --git a/src/Reactor/Hosting/ReactorHost.cs b/src/Reactor/Hosting/ReactorHost.cs index ce97b3102..01d8ad5e5 100644 --- a/src/Reactor/Hosting/ReactorHost.cs +++ b/src/Reactor/Hosting/ReactorHost.cs @@ -881,7 +881,7 @@ private void OnColorValuesChanged( /// RenderLoop callbacks and Low-priority re-renders all complete before returning. /// Used by test harnesses to replace blind Task.Delay waits. /// - public Task WaitForIdleAsync(int maxYields = 10) + public Task WaitForIdleAsync(int maxYields = 50) { if (_disposed) return Task.CompletedTask; if (_renderPending == 0 && !_isRendering && !_needsRerender) @@ -891,15 +891,28 @@ public Task WaitForIdleAsync(int maxYields = 10) int yields = 0; void CheckIdle() { - if (_disposed || ++yields > maxYields || - (_renderPending == 0 && !_isRendering && !_needsRerender)) + if (_disposed) { tcs.TrySetResult(); + return; } - else + if (_renderPending == 0 && !_isRendering && !_needsRerender) { - _dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle); + tcs.TrySetResult(); + return; + } + if (++yields > maxYields) + { + // Returning early here is the classic flake source: callers + // (e.g. selftest Harness.Render) move on against a half-settled + // tree. Log so the next flake is greppable instead of silent. + Debug.WriteLine( + $"[Reactor.WaitForIdle] yield cap hit ({maxYields}); " + + $"renderPending={_renderPending} isRendering={_isRendering} needsRerender={_needsRerender}"); + tcs.TrySetResult(); + return; } + _dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle); } _dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle); return tcs.Task; diff --git a/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs b/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs index ef69a2e03..af20df929 100644 --- a/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs +++ b/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs @@ -525,10 +525,12 @@ internal class EventSubscriptionLeakBaseline(Harness h) : SelfTestFixtureBase(h) { // 100 mount/unmount cycles × 2 Harness.Render() each = 200 renders + 200 // reconcile passes. Locally this runs ~15s; CI VMs under contention have - // been measured at 2-4× slower per INVESTIGATION.md Cluster T, easily - // overshooting the prior 30s budget on a heavy iteration. The cap exists - // to catch a hung fixture, not to set a perf target. - public override TimeSpan FixtureTimeout => TimeSpan.FromSeconds(60); + // been measured at 2-4× slower per INVESTIGATION.md Cluster T (i.e. up to + // ~60s on a heavy iteration). Prior 60s budget tripped the watchdog once + // in 500 stress iterations; 120s gives margin without turning a real hang + // into a long wait — the per-cycle heartbeat checks below make true hangs + // surface within a few seconds via the watchdog's per-check progress signal. + public override TimeSpan FixtureTimeout => TimeSpan.FromSeconds(120); public override async Task RunAsync() { @@ -547,6 +549,7 @@ public override async Task RunAsync() // Drain to an empty host. host.Mount(_ => TextBlock("warmup-done")); await Harness.Render(); + H.Check("Reliability_LeakBaseline_WarmupComplete", true); // Force GC so the baseline reflects steady state. Marshal off // the UI dispatcher to avoid a finalizer-deadlock on UI-thread- @@ -576,6 +579,10 @@ await Task.Run(() => await Harness.Render(); host.Mount(_ => TextBlock($"between-{i}")); await Harness.Render(); + // Heartbeat every 25 cycles so the watchdog sees forward progress + // (it only fires when no H.Check has printed `ok` for 60s). + if ((i + 1) % 25 == 0) + H.Check($"Reliability_LeakBaseline_Cycle{i + 1}Progress", true); } await Task.Run(() => diff --git a/tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs b/tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs index c8b0af036..32365258a 100644 --- a/tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs +++ b/tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs @@ -15,8 +15,10 @@ internal abstract class SelfTestFixtureBase // ~28 s on the worst tick. Previously several fixtures had explicit // 30 s overrides (PR #397, #399); raising the default folds them in // and prevents new sister fixtures from inheriting a budget that - // doesn't survive CI variance. The host-level HangWatchdogLoop at - // 60 s (SelfTestRunner.cs) continues to catch true dispatcher hangs. + // doesn't survive CI variance. The host-level HangWatchdogLoop in + // SelfTestRunner.cs uses max(60 s, FixtureTimeout + 30 s) so a + // fixture's own graceful timeout always gets first crack, and the + // dispatcher-starvation FailFast only fires after that. public virtual TimeSpan FixtureTimeout => TimeSpan.FromSeconds(30); public abstract Task RunAsync(); diff --git a/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs b/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs index 46567a2c6..4b3d6b0c2 100644 --- a/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs +++ b/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs @@ -52,10 +52,18 @@ private static TimeSpan ResolveHangTimeout() // Single immutable progress record — published atomically via // Volatile.Read/Write so the watchdog can never read a mixed - // (new-name, old-timestamp) state. - private sealed record FixtureProgress(string Name, long StartTimestamp); + // (new-name, old-timestamp) state. HangThreshold is per-fixture so + // long-budget fixtures (e.g. EventSubscriptionLeakBaseline at 120 s) + // don't trip a global 60 s ceiling. + private sealed record FixtureProgress(string Name, long StartTimestamp, TimeSpan HangThreshold); private static FixtureProgress? _currentFixture; + // Minimum slack between a fixture's own timeout and the watchdog. + // The watchdog's job is to FailFast (dumpable) when the fixture's own + // timeout couldn't fire because the dispatcher itself is stuck — + // i.e. only after the graceful timeout had its chance. + private static readonly TimeSpan HangSlack = TimeSpan.FromSeconds(30); + // Fixtures known to assert-fail under NativeAOT, captured by running // tests/Reactor.AppTests.Host/probe-aot-skips.ps1 against the AOT-published // Host. As of WindowsAppSDK#6394 workaround (see Reactor.AppTests.Host.csproj @@ -234,12 +242,6 @@ public static void RunAll() continue; } - // Publish progress to the off-dispatcher watchdog so - // it can identify the in-flight fixture if the - // dispatcher gets blocked. - Volatile.Write(ref _currentFixture, - new FixtureProgress(fixtureName, Stopwatch.GetTimestamp())); - int failuresBefore = harness.Failures; bool crashed = false; try @@ -247,18 +249,33 @@ public static void RunAll() var fixture = SelfTestFixtureRegistry.Create(fixtureName, harness); if (fixture is null) { + // Publish a placeholder so the watchdog has a + // name to report if anything goes wrong before + // the next fixture starts. + Volatile.Write(ref _currentFixture, + new FixtureProgress(fixtureName, Stopwatch.GetTimestamp(), HangTimeout)); Console.WriteLine($"not ok {testIndex} {fixtureName} - fixture not found"); harness.RecordFailure(); crashed = true; } else { + var timeout = fixture.FixtureTimeout; + // Per-fixture hang threshold: at least the + // global floor, and always strictly past the + // fixture's own graceful timeout so the + // watchdog only fires when that timeout + // couldn't (i.e. dispatcher truly stuck). + var perFixtureHang = timeout + HangSlack; + if (perFixtureHang < HangTimeout) perFixtureHang = HangTimeout; + Volatile.Write(ref _currentFixture, + new FixtureProgress(fixtureName, Stopwatch.GetTimestamp(), perFixtureHang)); + Console.WriteLine($"# Running: {fixtureName}"); // Flush so the parent harness can attribute a // hang to this fixture by name even if the // child terminates abruptly afterward. Console.Out.Flush(); - var timeout = fixture.FixtureTimeout; var runTask = fixture.RunAsync(); var timeoutTask = Task.Delay(timeout); var completed = await Task.WhenAny(runTask, timeoutTask); @@ -343,11 +360,11 @@ private static void HangWatchdogLoop() if (progress is null) continue; var elapsed = Stopwatch.GetElapsedTime(progress.StartTimestamp); - if (elapsed < HangTimeout) continue; + if (elapsed < progress.HangThreshold) continue; - // We are >= HangTimeout into a fixture and the dispatcher hasn't - // moved on. Emit a structured signal, flush, and FailFast so a - // Watson/.NET minidump is produced (when DOTNET_DbgEnableMiniDump=1). + // We are past the per-fixture hang threshold and the dispatcher + // hasn't moved on. Emit a structured signal, flush, and FailFast + // so a Watson/.NET minidump is produced (when DOTNET_DbgEnableMiniDump=1). var elapsedSec = (int)elapsed.TotalSeconds; var message = $"Bail out! HANG_DETECTED: {progress.Name} ran {elapsedSec}s " + From 3ee35aa6c6ce34d01e49fb2439e2e9115454d1a9 Mon Sep 17 00:00:00 2001 From: Chris Anderson Date: Wed, 27 May 2026 09:58:18 -0700 Subject: [PATCH 2/2] address CR feedback from Copilot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - WaitForIdleAsync: use RunContinuationsAsynchronously so await continuations don't run inline on the dispatcher at Low priority (defeating the yield loop's purpose); honour TryEnqueue failures (queue shutdown) by completing the TCS rather than hanging. - SelfTestRunner: publish a baseline FixtureProgress *before* calling SelfTestFixtureRegistry.Create, so the watchdog can still attribute a hang if construction itself blocks. Upgrade the per-fixture threshold once FixtureTimeout is known. - Fix misleading comments in EventSubscriptionLeakBaseline that claimed the heartbeat H.Check calls reset the watchdog — they don't; the watchdog uses elapsed-since-fixture-start. Heartbeats are log breadcrumbs only. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/Reactor/Hosting/ReactorHost.cs | 19 ++++++++++++++++--- .../NativeDockingReliabilityFixture.cs | 15 +++++++++------ .../SelfTest/SelfTestRunner.cs | 15 +++++++++------ 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/src/Reactor/Hosting/ReactorHost.cs b/src/Reactor/Hosting/ReactorHost.cs index 01d8ad5e5..6053ddbdf 100644 --- a/src/Reactor/Hosting/ReactorHost.cs +++ b/src/Reactor/Hosting/ReactorHost.cs @@ -887,7 +887,11 @@ public Task WaitForIdleAsync(int maxYields = 50) if (_renderPending == 0 && !_isRendering && !_needsRerender) return Task.CompletedTask; - var tcs = new TaskCompletionSource(); + // RunContinuationsAsynchronously: TrySetResult is called from a + // dispatcher callback, and without this flag any await continuation + // would run inline on the dispatcher at Low priority — re-entering + // UI logic inside the yield loop and partially defeating its purpose. + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); int yields = 0; void CheckIdle() { @@ -912,9 +916,18 @@ void CheckIdle() tcs.TrySetResult(); return; } - _dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle); + if (!_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle)) + { + // Queue refused enqueue (shutdown). Complete rather than + // hang the caller forever. + tcs.TrySetResult(); + } + } + if (!_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle)) + { + // Same fallback for the initial enqueue. + tcs.TrySetResult(); } - _dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle); return tcs.Task; } diff --git a/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs b/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs index af20df929..944f9597e 100644 --- a/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs +++ b/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs @@ -526,10 +526,10 @@ internal class EventSubscriptionLeakBaseline(Harness h) : SelfTestFixtureBase(h) // 100 mount/unmount cycles × 2 Harness.Render() each = 200 renders + 200 // reconcile passes. Locally this runs ~15s; CI VMs under contention have // been measured at 2-4× slower per INVESTIGATION.md Cluster T (i.e. up to - // ~60s on a heavy iteration). Prior 60s budget tripped the watchdog once - // in 500 stress iterations; 120s gives margin without turning a real hang - // into a long wait — the per-cycle heartbeat checks below make true hangs - // surface within a few seconds via the watchdog's per-check progress signal. + // ~60s on a heavy iteration). Prior 60s budget tripped the host-level + // HangWatchdogLoop once in 500 stress iterations; 120s gives margin and, + // under the new per-fixture watchdog rule (SelfTestRunner.HangSlack), + // automatically lifts that watchdog threshold to 150s. public override TimeSpan FixtureTimeout => TimeSpan.FromSeconds(120); public override async Task RunAsync() @@ -579,8 +579,11 @@ await Task.Run(() => await Harness.Render(); host.Mount(_ => TextBlock($"between-{i}")); await Harness.Render(); - // Heartbeat every 25 cycles so the watchdog sees forward progress - // (it only fires when no H.Check has printed `ok` for 60s). + // Heartbeat every 25 cycles. NOTE: this does NOT reset the + // host-level HangWatchdogLoop — that uses elapsed-since-fixture- + // start, not TAP output. These are log breadcrumbs only, so a + // future hang reveals which 25-cycle window it lived in + // (e.g. "Cycle25Progress: ok" printed but Cycle50 did not). if ((i + 1) % 25 == 0) H.Check($"Reliability_LeakBaseline_Cycle{i + 1}Progress", true); } diff --git a/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs b/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs index 4b3d6b0c2..d5863a49d 100644 --- a/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs +++ b/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs @@ -242,6 +242,14 @@ public static void RunAll() continue; } + // Publish a baseline progress record *before* calling + // Create() so the watchdog can attribute a hang even + // if construction itself blocks. We'll upgrade the + // threshold once we know the fixture's own timeout. + var fixtureStart = Stopwatch.GetTimestamp(); + Volatile.Write(ref _currentFixture, + new FixtureProgress(fixtureName, fixtureStart, HangTimeout)); + int failuresBefore = harness.Failures; bool crashed = false; try @@ -249,11 +257,6 @@ public static void RunAll() var fixture = SelfTestFixtureRegistry.Create(fixtureName, harness); if (fixture is null) { - // Publish a placeholder so the watchdog has a - // name to report if anything goes wrong before - // the next fixture starts. - Volatile.Write(ref _currentFixture, - new FixtureProgress(fixtureName, Stopwatch.GetTimestamp(), HangTimeout)); Console.WriteLine($"not ok {testIndex} {fixtureName} - fixture not found"); harness.RecordFailure(); crashed = true; @@ -269,7 +272,7 @@ public static void RunAll() var perFixtureHang = timeout + HangSlack; if (perFixtureHang < HangTimeout) perFixtureHang = HangTimeout; Volatile.Write(ref _currentFixture, - new FixtureProgress(fixtureName, Stopwatch.GetTimestamp(), perFixtureHang)); + new FixtureProgress(fixtureName, fixtureStart, perFixtureHang)); Console.WriteLine($"# Running: {fixtureName}"); // Flush so the parent harness can attribute a