Skip to content

Commit 8acc1af

Browse files
fix(stress): per-fixture hang watchdog + WaitForIdle cap raise (#431)
* fix(stress): per-fixture hang watchdog + WaitForIdle cap raise Two race-driven flakes surfaced in the 500x selftest stress run on main (GitHub Actions run 26513480330): 4/10000 iterations failed, with 2 of 4 in NativeDocking_* fixtures. WaitForIdleAsync silently returned after 10 low-priority dispatcher yields whether or not the render loop had settled. Under CI VM contention this dropped tests onto a half-rendered tree, producing e.g. SplitDocArea_CloseNonLast finding no `body:d1` on iter 3/500. Raise the default cap to 50 and log a Debug.WriteLine when it hits, so the next flake is greppable instead of silent. The host-level HangWatchdogLoop used a global 60 s constant — equal to EventSubscriptionLeakBaseline's own FixtureTimeout. The two raced on the worst-case CI tick (~60 s for 200 renders + 200 reconciles) and the watchdog won, FailFasting the shard instead of letting the fixture's graceful timeout fire first. Make the watchdog threshold per-fixture: max(60 s, FixtureTimeout + 30 s). The fixture's own budget always gets first crack; dispatcher-starvation FailFast only fires after that. Also: bump EventSubscriptionLeakBaseline FixtureTimeout 60 → 120 s (auto-bumps its watchdog to 150 s under the new rule), and add heartbeat H.Check calls every 25 cycles so a future hang reveals which loop window it lived in rather than just "no progress". Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * address CR feedback from Copilot - WaitForIdleAsync: use RunContinuationsAsynchronously so await continuations don't run inline on the dispatcher at Low priority (defeating the yield loop's purpose); honour TryEnqueue failures (queue shutdown) by completing the TCS rather than hanging. - SelfTestRunner: publish a baseline FixtureProgress *before* calling SelfTestFixtureRegistry.Create, so the watchdog can still attribute a hang if construction itself blocks. Upgrade the per-fixture threshold once FixtureTimeout is known. - Fix misleading comments in EventSubscriptionLeakBaseline that claimed the heartbeat H.Check calls reset the watchdog — they don't; the watchdog uses elapsed-since-fixture-start. Heartbeats are log breadcrumbs only. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e19bd0f commit 8acc1af

4 files changed

Lines changed: 82 additions & 24 deletions

File tree

src/Reactor/Hosting/ReactorHost.cs

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -881,27 +881,53 @@ private void OnColorValuesChanged(
881881
/// RenderLoop callbacks and Low-priority re-renders all complete before returning.
882882
/// Used by test harnesses to replace blind Task.Delay waits.
883883
/// </summary>
884-
public Task WaitForIdleAsync(int maxYields = 10)
884+
public Task WaitForIdleAsync(int maxYields = 50)
885885
{
886886
if (_disposed) return Task.CompletedTask;
887887
if (_renderPending == 0 && !_isRendering && !_needsRerender)
888888
return Task.CompletedTask;
889889

890-
var tcs = new TaskCompletionSource();
890+
// RunContinuationsAsynchronously: TrySetResult is called from a
891+
// dispatcher callback, and without this flag any await continuation
892+
// would run inline on the dispatcher at Low priority — re-entering
893+
// UI logic inside the yield loop and partially defeating its purpose.
894+
var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
891895
int yields = 0;
892896
void CheckIdle()
893897
{
894-
if (_disposed || ++yields > maxYields ||
895-
(_renderPending == 0 && !_isRendering && !_needsRerender))
898+
if (_disposed)
896899
{
897900
tcs.TrySetResult();
901+
return;
898902
}
899-
else
903+
if (_renderPending == 0 && !_isRendering && !_needsRerender)
900904
{
901-
_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle);
905+
tcs.TrySetResult();
906+
return;
907+
}
908+
if (++yields > maxYields)
909+
{
910+
// Returning early here is the classic flake source: callers
911+
// (e.g. selftest Harness.Render) move on against a half-settled
912+
// tree. Log so the next flake is greppable instead of silent.
913+
Debug.WriteLine(
914+
$"[Reactor.WaitForIdle] yield cap hit ({maxYields}); " +
915+
$"renderPending={_renderPending} isRendering={_isRendering} needsRerender={_needsRerender}");
916+
tcs.TrySetResult();
917+
return;
902918
}
919+
if (!_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle))
920+
{
921+
// Queue refused enqueue (shutdown). Complete rather than
922+
// hang the caller forever.
923+
tcs.TrySetResult();
924+
}
925+
}
926+
if (!_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle))
927+
{
928+
// Same fallback for the initial enqueue.
929+
tcs.TrySetResult();
903930
}
904-
_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle);
905931
return tcs.Task;
906932
}
907933

tests/Reactor.AppTests.Host/SelfTest/Fixtures/NativeDockingReliabilityFixture.cs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -525,10 +525,12 @@ internal class EventSubscriptionLeakBaseline(Harness h) : SelfTestFixtureBase(h)
525525
{
526526
// 100 mount/unmount cycles × 2 Harness.Render() each = 200 renders + 200
527527
// reconcile passes. Locally this runs ~15s; CI VMs under contention have
528-
// been measured at 2-4× slower per INVESTIGATION.md Cluster T, easily
529-
// overshooting the prior 30s budget on a heavy iteration. The cap exists
530-
// to catch a hung fixture, not to set a perf target.
531-
public override TimeSpan FixtureTimeout => TimeSpan.FromSeconds(60);
528+
// been measured at 2-4× slower per INVESTIGATION.md Cluster T (i.e. up to
529+
// ~60s on a heavy iteration). Prior 60s budget tripped the host-level
530+
// HangWatchdogLoop once in 500 stress iterations; 120s gives margin and,
531+
// under the new per-fixture watchdog rule (SelfTestRunner.HangSlack),
532+
// automatically lifts that watchdog threshold to 150s.
533+
public override TimeSpan FixtureTimeout => TimeSpan.FromSeconds(120);
532534

533535
public override async Task RunAsync()
534536
{
@@ -547,6 +549,7 @@ public override async Task RunAsync()
547549
// Drain to an empty host.
548550
host.Mount(_ => TextBlock("warmup-done"));
549551
await Harness.Render();
552+
H.Check("Reliability_LeakBaseline_WarmupComplete", true);
550553

551554
// Force GC so the baseline reflects steady state. Marshal off
552555
// the UI dispatcher to avoid a finalizer-deadlock on UI-thread-
@@ -576,6 +579,13 @@ await Task.Run(() =>
576579
await Harness.Render();
577580
host.Mount(_ => TextBlock($"between-{i}"));
578581
await Harness.Render();
582+
// Heartbeat every 25 cycles. NOTE: this does NOT reset the
583+
// host-level HangWatchdogLoop — that uses elapsed-since-fixture-
584+
// start, not TAP output. These are log breadcrumbs only, so a
585+
// future hang reveals which 25-cycle window it lived in
586+
// (e.g. "Cycle25Progress: ok" printed but Cycle50 did not).
587+
if ((i + 1) % 25 == 0)
588+
H.Check($"Reliability_LeakBaseline_Cycle{i + 1}Progress", true);
579589
}
580590

581591
await Task.Run(() =>

tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ internal abstract class SelfTestFixtureBase
1515
// ~28 s on the worst tick. Previously several fixtures had explicit
1616
// 30 s overrides (PR #397, #399); raising the default folds them in
1717
// and prevents new sister fixtures from inheriting a budget that
18-
// doesn't survive CI variance. The host-level HangWatchdogLoop at
19-
// 60 s (SelfTestRunner.cs) continues to catch true dispatcher hangs.
18+
// doesn't survive CI variance. The host-level HangWatchdogLoop in
19+
// SelfTestRunner.cs uses max(60 s, FixtureTimeout + 30 s) so a
20+
// fixture's own graceful timeout always gets first crack, and the
21+
// dispatcher-starvation FailFast only fires after that.
2022
public virtual TimeSpan FixtureTimeout => TimeSpan.FromSeconds(30);
2123

2224
public abstract Task RunAsync();

tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,18 @@ private static TimeSpan ResolveHangTimeout()
5252

5353
// Single immutable progress record — published atomically via
5454
// Volatile.Read/Write so the watchdog can never read a mixed
55-
// (new-name, old-timestamp) state.
56-
private sealed record FixtureProgress(string Name, long StartTimestamp);
55+
// (new-name, old-timestamp) state. HangThreshold is per-fixture so
56+
// long-budget fixtures (e.g. EventSubscriptionLeakBaseline at 120 s)
57+
// don't trip a global 60 s ceiling.
58+
private sealed record FixtureProgress(string Name, long StartTimestamp, TimeSpan HangThreshold);
5759
private static FixtureProgress? _currentFixture;
5860

61+
// Minimum slack between a fixture's own timeout and the watchdog.
62+
// The watchdog's job is to FailFast (dumpable) when the fixture's own
63+
// timeout couldn't fire because the dispatcher itself is stuck —
64+
// i.e. only after the graceful timeout had its chance.
65+
private static readonly TimeSpan HangSlack = TimeSpan.FromSeconds(30);
66+
5967
// Fixtures known to assert-fail under NativeAOT, captured by running
6068
// tests/Reactor.AppTests.Host/probe-aot-skips.ps1 against the AOT-published
6169
// Host. As of WindowsAppSDK#6394 workaround (see Reactor.AppTests.Host.csproj
@@ -234,11 +242,13 @@ public static void RunAll()
234242
continue;
235243
}
236244

237-
// Publish progress to the off-dispatcher watchdog so
238-
// it can identify the in-flight fixture if the
239-
// dispatcher gets blocked.
245+
// Publish a baseline progress record *before* calling
246+
// Create() so the watchdog can attribute a hang even
247+
// if construction itself blocks. We'll upgrade the
248+
// threshold once we know the fixture's own timeout.
249+
var fixtureStart = Stopwatch.GetTimestamp();
240250
Volatile.Write(ref _currentFixture,
241-
new FixtureProgress(fixtureName, Stopwatch.GetTimestamp()));
251+
new FixtureProgress(fixtureName, fixtureStart, HangTimeout));
242252

243253
int failuresBefore = harness.Failures;
244254
bool crashed = false;
@@ -253,12 +263,22 @@ public static void RunAll()
253263
}
254264
else
255265
{
266+
var timeout = fixture.FixtureTimeout;
267+
// Per-fixture hang threshold: at least the
268+
// global floor, and always strictly past the
269+
// fixture's own graceful timeout so the
270+
// watchdog only fires when that timeout
271+
// couldn't (i.e. dispatcher truly stuck).
272+
var perFixtureHang = timeout + HangSlack;
273+
if (perFixtureHang < HangTimeout) perFixtureHang = HangTimeout;
274+
Volatile.Write(ref _currentFixture,
275+
new FixtureProgress(fixtureName, fixtureStart, perFixtureHang));
276+
256277
Console.WriteLine($"# Running: {fixtureName}");
257278
// Flush so the parent harness can attribute a
258279
// hang to this fixture by name even if the
259280
// child terminates abruptly afterward.
260281
Console.Out.Flush();
261-
var timeout = fixture.FixtureTimeout;
262282
var runTask = fixture.RunAsync();
263283
var timeoutTask = Task.Delay(timeout);
264284
var completed = await Task.WhenAny(runTask, timeoutTask);
@@ -343,11 +363,11 @@ private static void HangWatchdogLoop()
343363
if (progress is null) continue;
344364

345365
var elapsed = Stopwatch.GetElapsedTime(progress.StartTimestamp);
346-
if (elapsed < HangTimeout) continue;
366+
if (elapsed < progress.HangThreshold) continue;
347367

348-
// We are >= HangTimeout into a fixture and the dispatcher hasn't
349-
// moved on. Emit a structured signal, flush, and FailFast so a
350-
// Watson/.NET minidump is produced (when DOTNET_DbgEnableMiniDump=1).
368+
// We are past the per-fixture hang threshold and the dispatcher
369+
// hasn't moved on. Emit a structured signal, flush, and FailFast
370+
// so a Watson/.NET minidump is produced (when DOTNET_DbgEnableMiniDump=1).
351371
var elapsedSec = (int)elapsed.TotalSeconds;
352372
var message =
353373
$"Bail out! HANG_DETECTED: {progress.Name} ran {elapsedSec}s " +

0 commit comments

Comments
 (0)