diff --git a/src/Reactor/Hosting/ReactorHost.cs b/src/Reactor/Hosting/ReactorHost.cs index ce97b310..6053ddbd 100644 --- a/src/Reactor/Hosting/ReactorHost.cs +++ b/src/Reactor/Hosting/ReactorHost.cs @@ -881,27 +881,53 @@ private void OnColorValuesChanged( /// RenderLoop callbacks and Low-priority re-renders all complete before returning. /// Used by test harnesses to replace blind Task.Delay waits. /// - public Task WaitForIdleAsync(int maxYields = 10) + public Task WaitForIdleAsync(int maxYields = 50) { if (_disposed) return Task.CompletedTask; if (_renderPending == 0 && !_isRendering && !_needsRerender) return Task.CompletedTask; - var tcs = new TaskCompletionSource(); + // RunContinuationsAsynchronously: TrySetResult is called from a + // dispatcher callback, and without this flag any await continuation + // would run inline on the dispatcher at Low priority — re-entering + // UI logic inside the yield loop and partially defeating its purpose. + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); int yields = 0; void CheckIdle() { - if (_disposed || ++yields > maxYields || - (_renderPending == 0 && !_isRendering && !_needsRerender)) + if (_disposed) { tcs.TrySetResult(); + return; } - else + if (_renderPending == 0 && !_isRendering && !_needsRerender) { - _dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle); + tcs.TrySetResult(); + return; + } + if (++yields > maxYields) + { + // Returning early here is the classic flake source: callers + // (e.g. selftest Harness.Render) move on against a half-settled + // tree. Log so the next flake is greppable instead of silent. + Debug.WriteLine( + $"[Reactor.WaitForIdle] yield cap hit ({maxYields}); " + + $"renderPending={_renderPending} isRendering={_isRendering} needsRerender={_needsRerender}"); + tcs.TrySetResult(); + return; } + if (!_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle)) + { + // Queue refused enqueue (shutdown). Complete rather than + // hang the caller forever. + tcs.TrySetResult(); + } + } + if (!_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle)) + { + // Same fallback for the initial enqueue. + tcs.TrySetResult(); } - _dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle); return tcs.Task; } diff --git a/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs b/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs index ef69a2e0..944f9597 100644 --- a/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs +++ b/tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs @@ -525,10 +525,12 @@ internal class EventSubscriptionLeakBaseline(Harness h) : SelfTestFixtureBase(h) { // 100 mount/unmount cycles × 2 Harness.Render() each = 200 renders + 200 // reconcile passes. Locally this runs ~15s; CI VMs under contention have - // been measured at 2-4× slower per INVESTIGATION.md Cluster T, easily - // overshooting the prior 30s budget on a heavy iteration. The cap exists - // to catch a hung fixture, not to set a perf target. - public override TimeSpan FixtureTimeout => TimeSpan.FromSeconds(60); + // been measured at 2-4× slower per INVESTIGATION.md Cluster T (i.e. up to + // ~60s on a heavy iteration). Prior 60s budget tripped the host-level + // HangWatchdogLoop once in 500 stress iterations; 120s gives margin and, + // under the new per-fixture watchdog rule (SelfTestRunner.HangSlack), + // automatically lifts that watchdog threshold to 150s. + public override TimeSpan FixtureTimeout => TimeSpan.FromSeconds(120); public override async Task RunAsync() { @@ -547,6 +549,7 @@ public override async Task RunAsync() // Drain to an empty host. host.Mount(_ => TextBlock("warmup-done")); await Harness.Render(); + H.Check("Reliability_LeakBaseline_WarmupComplete", true); // Force GC so the baseline reflects steady state. Marshal off // the UI dispatcher to avoid a finalizer-deadlock on UI-thread- @@ -576,6 +579,13 @@ await Task.Run(() => await Harness.Render(); host.Mount(_ => TextBlock($"between-{i}")); await Harness.Render(); + // Heartbeat every 25 cycles. NOTE: this does NOT reset the + // host-level HangWatchdogLoop — that uses elapsed-since-fixture- + // start, not TAP output. These are log breadcrumbs only, so a + // future hang reveals which 25-cycle window it lived in + // (e.g. "Cycle25Progress: ok" printed but Cycle50 did not). + if ((i + 1) % 25 == 0) + H.Check($"Reliability_LeakBaseline_Cycle{i + 1}Progress", true); } await Task.Run(() => diff --git a/tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs b/tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs index c8b0af03..32365258 100644 --- a/tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs +++ b/tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs @@ -15,8 +15,10 @@ internal abstract class SelfTestFixtureBase // ~28 s on the worst tick. Previously several fixtures had explicit // 30 s overrides (PR #397, #399); raising the default folds them in // and prevents new sister fixtures from inheriting a budget that - // doesn't survive CI variance. The host-level HangWatchdogLoop at - // 60 s (SelfTestRunner.cs) continues to catch true dispatcher hangs. + // doesn't survive CI variance. The host-level HangWatchdogLoop in + // SelfTestRunner.cs uses max(60 s, FixtureTimeout + 30 s) so a + // fixture's own graceful timeout always gets first crack, and the + // dispatcher-starvation FailFast only fires after that. public virtual TimeSpan FixtureTimeout => TimeSpan.FromSeconds(30); public abstract Task RunAsync(); diff --git a/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs b/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs index 46567a2c..d5863a49 100644 --- a/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs +++ b/tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs @@ -52,10 +52,18 @@ private static TimeSpan ResolveHangTimeout() // Single immutable progress record — published atomically via // Volatile.Read/Write so the watchdog can never read a mixed - // (new-name, old-timestamp) state. - private sealed record FixtureProgress(string Name, long StartTimestamp); + // (new-name, old-timestamp) state. HangThreshold is per-fixture so + // long-budget fixtures (e.g. EventSubscriptionLeakBaseline at 120 s) + // don't trip a global 60 s ceiling. + private sealed record FixtureProgress(string Name, long StartTimestamp, TimeSpan HangThreshold); private static FixtureProgress? _currentFixture; + // Minimum slack between a fixture's own timeout and the watchdog. + // The watchdog's job is to FailFast (dumpable) when the fixture's own + // timeout couldn't fire because the dispatcher itself is stuck — + // i.e. only after the graceful timeout had its chance. + private static readonly TimeSpan HangSlack = TimeSpan.FromSeconds(30); + // Fixtures known to assert-fail under NativeAOT, captured by running // tests/Reactor.AppTests.Host/probe-aot-skips.ps1 against the AOT-published // Host. As of WindowsAppSDK#6394 workaround (see Reactor.AppTests.Host.csproj @@ -234,11 +242,13 @@ public static void RunAll() continue; } - // Publish progress to the off-dispatcher watchdog so - // it can identify the in-flight fixture if the - // dispatcher gets blocked. + // Publish a baseline progress record *before* calling + // Create() so the watchdog can attribute a hang even + // if construction itself blocks. We'll upgrade the + // threshold once we know the fixture's own timeout. + var fixtureStart = Stopwatch.GetTimestamp(); Volatile.Write(ref _currentFixture, - new FixtureProgress(fixtureName, Stopwatch.GetTimestamp())); + new FixtureProgress(fixtureName, fixtureStart, HangTimeout)); int failuresBefore = harness.Failures; bool crashed = false; @@ -253,12 +263,22 @@ public static void RunAll() } else { + var timeout = fixture.FixtureTimeout; + // Per-fixture hang threshold: at least the + // global floor, and always strictly past the + // fixture's own graceful timeout so the + // watchdog only fires when that timeout + // couldn't (i.e. dispatcher truly stuck). + var perFixtureHang = timeout + HangSlack; + if (perFixtureHang < HangTimeout) perFixtureHang = HangTimeout; + Volatile.Write(ref _currentFixture, + new FixtureProgress(fixtureName, fixtureStart, perFixtureHang)); + Console.WriteLine($"# Running: {fixtureName}"); // Flush so the parent harness can attribute a // hang to this fixture by name even if the // child terminates abruptly afterward. Console.Out.Flush(); - var timeout = fixture.FixtureTimeout; var runTask = fixture.RunAsync(); var timeoutTask = Task.Delay(timeout); var completed = await Task.WhenAny(runTask, timeoutTask); @@ -343,11 +363,11 @@ private static void HangWatchdogLoop() if (progress is null) continue; var elapsed = Stopwatch.GetElapsedTime(progress.StartTimestamp); - if (elapsed < HangTimeout) continue; + if (elapsed < progress.HangThreshold) continue; - // We are >= HangTimeout into a fixture and the dispatcher hasn't - // moved on. Emit a structured signal, flush, and FailFast so a - // Watson/.NET minidump is produced (when DOTNET_DbgEnableMiniDump=1). + // We are past the per-fixture hang threshold and the dispatcher + // hasn't moved on. Emit a structured signal, flush, and FailFast + // so a Watson/.NET minidump is produced (when DOTNET_DbgEnableMiniDump=1). var elapsedSec = (int)elapsed.TotalSeconds; var message = $"Bail out! HANG_DETECTED: {progress.Name} ran {elapsedSec}s " +