Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions src/Reactor/Hosting/ReactorHost.cs
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,7 @@ private void OnColorValuesChanged(
/// RenderLoop callbacks and Low-priority re-renders all complete before returning.
/// Used by test harnesses to replace blind Task.Delay waits.
/// </summary>
public Task WaitForIdleAsync(int maxYields = 10)
public Task WaitForIdleAsync(int maxYields = 50)
{
if (_disposed) return Task.CompletedTask;
if (_renderPending == 0 && !_isRendering && !_needsRerender)
Expand All @@ -891,15 +891,28 @@ public Task WaitForIdleAsync(int maxYields = 10)
int yields = 0;
void CheckIdle()
Comment thread
codemonkeychris marked this conversation as resolved.
{
if (_disposed || ++yields > maxYields ||
(_renderPending == 0 && !_isRendering && !_needsRerender))
if (_disposed)
{
tcs.TrySetResult();
return;
}
else
if (_renderPending == 0 && !_isRendering && !_needsRerender)
{
_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle);
tcs.TrySetResult();
return;
}
if (++yields > maxYields)
{
// Returning early here is the classic flake source: callers
// (e.g. selftest Harness.Render) move on against a half-settled
// tree. Log so the next flake is greppable instead of silent.
Debug.WriteLine(
$"[Reactor.WaitForIdle] yield cap hit ({maxYields}); " +
$"renderPending={_renderPending} isRendering={_isRendering} needsRerender={_needsRerender}");
tcs.TrySetResult();
return;
}
_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle);
}
_dispatcherQueue.TryEnqueue(DispatcherQueuePriority.Low, CheckIdle);
return tcs.Task;
Comment thread
codemonkeychris marked this conversation as resolved.
Outdated
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -525,10 +525,12 @@ internal class EventSubscriptionLeakBaseline(Harness h) : SelfTestFixtureBase(h)
{
// 100 mount/unmount cycles × 2 Harness.Render() each = 200 renders + 200
// reconcile passes. Locally this runs ~15s; CI VMs under contention have
// been measured at 2-4× slower per INVESTIGATION.md Cluster T, easily
// overshooting the prior 30s budget on a heavy iteration. The cap exists
// to catch a hung fixture, not to set a perf target.
public override TimeSpan FixtureTimeout => TimeSpan.FromSeconds(60);
// been measured at 2-4× slower per INVESTIGATION.md Cluster T (i.e. up to
// ~60s on a heavy iteration). Prior 60s budget tripped the watchdog once
// in 500 stress iterations; 120s gives margin without turning a real hang
// into a long wait — the heartbeat checks below (one every 25 cycles) keep the
// watchdog's per-check progress signal fresh, so a true hang surfaces roughly
// 60s after the last heartbeat rather than only after the full 120s budget.
public override TimeSpan FixtureTimeout => TimeSpan.FromSeconds(120);

public override async Task RunAsync()
{
Expand All @@ -547,6 +549,7 @@ public override async Task RunAsync()
// Drain to an empty host.
host.Mount(_ => TextBlock("warmup-done"));
await Harness.Render();
H.Check("Reliability_LeakBaseline_WarmupComplete", true);

// Force GC so the baseline reflects steady state. Marshal off
// the UI dispatcher to avoid a finalizer-deadlock on UI-thread-
Expand Down Expand Up @@ -576,6 +579,10 @@ await Task.Run(() =>
await Harness.Render();
host.Mount(_ => TextBlock($"between-{i}"));
await Harness.Render();
// Heartbeat every 25 cycles so the watchdog sees forward progress
// (it only fires when no H.Check has printed `ok` for 60s).
if ((i + 1) % 25 == 0)
Comment thread
codemonkeychris marked this conversation as resolved.
Outdated
H.Check($"Reliability_LeakBaseline_Cycle{i + 1}Progress", true);
}

await Task.Run(() =>
Expand Down
6 changes: 4 additions & 2 deletions tests/Reactor.AppTests.Host/SelfTest/SelfTestFixtureBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ internal abstract class SelfTestFixtureBase
// ~28 s on the worst tick. Previously several fixtures had explicit
// 30 s overrides (PR #397, #399); raising the default folds them in
// and prevents new sister fixtures from inheriting a budget that
// doesn't survive CI variance. The host-level HangWatchdogLoop at
// 60 s (SelfTestRunner.cs) continues to catch true dispatcher hangs.
// doesn't survive CI variance. The host-level HangWatchdogLoop in
// SelfTestRunner.cs uses max(60 s, FixtureTimeout + 30 s) so a
// fixture's own graceful timeout always gets first crack, and the
// dispatcher-starvation FailFast only fires after that.
public virtual TimeSpan FixtureTimeout => TimeSpan.FromSeconds(30);

public abstract Task RunAsync();
Expand Down
43 changes: 30 additions & 13 deletions tests/Reactor.AppTests.Host/SelfTest/SelfTestRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,18 @@ private static TimeSpan ResolveHangTimeout()

// Single immutable progress record — published atomically via
// Volatile.Read/Write so the watchdog can never read a mixed
// (new-name, old-timestamp) state.
private sealed record FixtureProgress(string Name, long StartTimestamp);
// (new-name, old-timestamp) state. HangThreshold is per-fixture so
// long-budget fixtures (e.g. EventSubscriptionLeakBaseline at 120 s)
// don't trip a global 60 s ceiling.
private sealed record FixtureProgress(string Name, long StartTimestamp, TimeSpan HangThreshold);
private static FixtureProgress? _currentFixture;

// Minimum slack between a fixture's own timeout and the watchdog.
// The watchdog's job is to FailFast (dumpable) when the fixture's own
// timeout couldn't fire because the dispatcher itself is stuck —
// i.e. only after the graceful timeout had its chance.
private static readonly TimeSpan HangSlack = TimeSpan.FromSeconds(30);

// Fixtures known to assert-fail under NativeAOT, captured by running
// tests/Reactor.AppTests.Host/probe-aot-skips.ps1 against the AOT-published
// Host. As of WindowsAppSDK#6394 workaround (see Reactor.AppTests.Host.csproj
Expand Down Expand Up @@ -234,31 +242,40 @@ public static void RunAll()
continue;
}

// Publish progress to the off-dispatcher watchdog so
// it can identify the in-flight fixture if the
// dispatcher gets blocked.
Volatile.Write(ref _currentFixture,
new FixtureProgress(fixtureName, Stopwatch.GetTimestamp()));

int failuresBefore = harness.Failures;
bool crashed = false;
try
{
var fixture = SelfTestFixtureRegistry.Create(fixtureName, harness);
if (fixture is null)
{
// Publish a placeholder so the watchdog has a
// name to report if anything goes wrong before
// the next fixture starts.
Volatile.Write(ref _currentFixture,
new FixtureProgress(fixtureName, Stopwatch.GetTimestamp(), HangTimeout));
Console.WriteLine($"not ok {testIndex} {fixtureName} - fixture not found");
harness.RecordFailure();
crashed = true;
}
else
{
var timeout = fixture.FixtureTimeout;
// Per-fixture hang threshold: at least the
// global floor, and always strictly past the
// fixture's own graceful timeout so the
// watchdog only fires when that timeout
// couldn't (i.e. dispatcher truly stuck).
var perFixtureHang = timeout + HangSlack;
if (perFixtureHang < HangTimeout) perFixtureHang = HangTimeout;
Volatile.Write(ref _currentFixture,
new FixtureProgress(fixtureName, Stopwatch.GetTimestamp(), perFixtureHang));
Comment thread
codemonkeychris marked this conversation as resolved.
Outdated

Console.WriteLine($"# Running: {fixtureName}");
// Flush so the parent harness can attribute a
// hang to this fixture by name even if the
// child terminates abruptly afterward.
Console.Out.Flush();
var timeout = fixture.FixtureTimeout;
var runTask = fixture.RunAsync();
var timeoutTask = Task.Delay(timeout);
var completed = await Task.WhenAny(runTask, timeoutTask);
Expand Down Expand Up @@ -343,11 +360,11 @@ private static void HangWatchdogLoop()
if (progress is null) continue;

var elapsed = Stopwatch.GetElapsedTime(progress.StartTimestamp);
if (elapsed < HangTimeout) continue;
if (elapsed < progress.HangThreshold) continue;

// We are >= HangTimeout into a fixture and the dispatcher hasn't
// moved on. Emit a structured signal, flush, and FailFast so a
// Watson/.NET minidump is produced (when DOTNET_DbgEnableMiniDump=1).
// We are past the per-fixture hang threshold and the dispatcher
// hasn't moved on. Emit a structured signal, flush, and FailFast
// so a Watson/.NET minidump is produced (when DOTNET_DbgEnableMiniDump=1).
var elapsedSec = (int)elapsed.TotalSeconds;
var message =
$"Bail out! HANG_DETECTED: {progress.Name} ran {elapsedSec}s " +
Expand Down
Loading