Skip to content

Commit 70bf68c

Browse files
authored
fix(ci): eliminate chronic E2E timeout — Runtime.Close Docker-verify waste (#478)
* fix(ci): skip Docker-cleanup verification when isolation unused Root cause of the chronic E2E red (failing on every main commit for weeks; v0.31.0/.1 shipped through it): Runtime.Close() and Server.Stop() always ran a Docker container-cleanup verification — a docker ps probe + a 15x1s poll loop + 2s sleep (~17s) — even when no Docker isolation was ever used. Every runtime-constructing unit test paid ~17s in t.Cleanup; across ~dozens of tests under -race the internal/runtime package hung past 12 min, blowing CI's -race -timeout 2m step. Fix: add Manager.UsesDockerIsolation() (same predicate as the Docker recovery monitor) and gate both cleanup-verification blocks on it. No isolation => no managed containers => nothing to verify. Production behavior with Docker isolation is unchanged. Also add testing.Short() guards to the heaviest property/timing tests (TestRapidQuarantineStateMachine ~270s, TestRapidInvariant ~70s, and five ~18s timing tests) so the fast unit step stays lean; they still run in the main-only stress-tests job. Result (local, exact CI condition -short -race -timeout 2m): - internal/runtime: hang >12m -> ok 20s - internal/server: slow -> ok 18s - internal/upstream/contracts: ok Related #468 * ci(e2e): -short the fast unit step; heavy tests move to stress job Pairs with the Runtime.Close Docker-verify fix. The fast unit step now passes -short (activating the testing.Short guards) with a 4m race timeout for headroom; the heavy property/timing tests run unguarded in the main-only stress-tests job so coverage is preserved. Related #468
1 parent ff03db9 commit 70bf68c

9 files changed

Lines changed: 56 additions & 8 deletions

File tree

.github/workflows/e2e-tests.yml

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,10 @@ jobs:
9191
run: go build -tags nogui -o mcpproxy ./cmd/mcpproxy
9292

9393
- name: Run unit tests
94-
run: go test -v -race -timeout 2m -skip "Binary|MCP|E2E|TestInfoEndpoint|TestGracefulShutdownNoPanic|TestSocketInfoEndpoint" ./internal/...
94+
# -short skips the heavy property/timing tests (TestRapidQuarantine* etc.);
95+
# they run unguarded in the main-only stress-tests job below so coverage
96+
# is preserved. Bumped to 4m for headroom under -race on slow runners.
97+
run: go test -v -short -race -timeout 4m -skip "Binary|MCP|E2E|TestInfoEndpoint|TestGracefulShutdownNoPanic|TestSocketInfoEndpoint" ./internal/...
9598

9699
# Binary tests are too flaky in CI due to server startup timing issues
97100
# All Binary tests consistently timeout waiting for server to be ready
@@ -325,21 +328,21 @@ jobs:
325328
if [ -d "${{ matrix.log_path_check }}" ]; then
326329
echo "✓ Log directory created successfully"
327330
ls -la "${{ matrix.log_path_check }}"
328-
331+
329332
# Check if log file exists and has content
330333
if [ -f "${{ matrix.log_path_check }}/main.log" ]; then
331334
echo "✓ Log file created successfully"
332335
echo "Log file size: $(wc -c < "${{ matrix.log_path_check }}/main.log") bytes"
333336
echo "First few lines:"
334337
head -3 "${{ matrix.log_path_check }}/main.log"
335-
338+
336339
# Verify log contains expected content
337340
if grep -q "Log directory configured" "${{ matrix.log_path_check }}/main.log"; then
338341
echo "✓ Log contains expected startup messages"
339342
else
340343
echo "⚠ Log missing expected startup messages"
341344
fi
342-
345+
343346
if grep -q "${{ matrix.log_standard }}" "${{ matrix.log_path_check }}/main.log"; then
344347
echo "✓ Log contains OS standard compliance information"
345348
else
@@ -476,6 +479,13 @@ jobs:
476479
mkdir -p web/frontend
477480
cp -r frontend/dist web/frontend/
478481
482+
- name: Run heavy runtime property/timing tests
483+
run: |
484+
# These are -short-skipped in the fast unit step; run them here
485+
# (unguarded, generous timeout) so invariant coverage is preserved.
486+
go test -v -race -timeout 20m ./internal/runtime \
487+
-run 'TestRapidQuarantineStateMachine|TestRapidInvariant_ChangedNeverAutoApproved|TestApplyConfig_ListenAddressChange|TestIsDockerAvailable_NegativeTTLShorter|TestIsDockerAvailable_PositiveTTLHonored|TestToolCacheInvalidation_DisableServerRemovesTools|TestRuntimeStatusSnapshotReflectsRunningAndListen'
488+
479489
- name: Run concurrent stress tests
480490
run: |
481491
# Run the concurrent test multiple times to catch race conditions

internal/runtime/apply_config_restart_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ import (
1717
// TestApplyConfig_ListenAddressChange tests that listen address changes are saved to disk
1818
// even though they require a restart
1919
func TestApplyConfig_ListenAddressChange(t *testing.T) {
20+
if testing.Short() {
21+
t.Skip("server restart timing test (~18s under -race); runs in the stress-tests CI job")
22+
}
2023
// Create temp directory and config file
2124
tmpDir := t.TempDir()
2225
cfgPath := filepath.Join(tmpDir, "config.json")

internal/runtime/docker_probe_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ func TestIsDockerAvailable_CachesResult(t *testing.T) {
3939
// result expires after 5m (negative TTL) — so users who launch Docker
4040
// after mcpproxy starts see the flip within minutes, not only after restart.
4141
func TestIsDockerAvailable_NegativeTTLShorter(t *testing.T) {
42+
if testing.Short() {
43+
t.Skip("real-time TTL test (~18s under -race); runs in the stress-tests CI job")
44+
}
4245
rt := newTestRuntime(t)
4346

4447
// Prime with a negative result 6 minutes old — past negative TTL (5m)
@@ -65,6 +68,9 @@ func TestIsDockerAvailable_NegativeTTLShorter(t *testing.T) {
6568
// result within 15m is reused (no re-probe) to avoid spending 2s on
6669
// `docker info` on every telemetry heartbeat.
6770
func TestIsDockerAvailable_PositiveTTLHonored(t *testing.T) {
71+
if testing.Short() {
72+
t.Skip("real-time TTL test (~18s under -race); runs in the stress-tests CI job")
73+
}
6874
rt := newTestRuntime(t)
6975

7076
// 10 minutes old — well within positive TTL (15m).

internal/runtime/runtime.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -580,8 +580,12 @@ func (r *Runtime) Close() error {
580580
}
581581
}
582582

583-
// Verify all containers stopped with retry loop (15 attempts = 15 seconds)
584-
if r.upstreamManager.HasDockerContainers() {
583+
// Verify all containers stopped with retry loop (15 attempts = 15 seconds).
584+
// Only when Docker isolation could have launched containers — otherwise
585+
// the `docker ps` probe + this loop are pure waste (and add ~17s per
586+
// Close in test processes, which made internal/runtime exceed CI's
587+
// -race timeout). No isolation ⇒ no managed containers ⇒ nothing to verify.
588+
if r.upstreamManager.UsesDockerIsolation() && r.upstreamManager.HasDockerContainers() {
585589
if r.logger != nil {
586590
r.logger.Warn("Docker containers still running after shutdown, verifying cleanup...")
587591
}

internal/runtime/runtime_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ func TestRuntimeUpdateStatusBroadcasts(t *testing.T) {
7979
}
8080

8181
func TestRuntimeStatusSnapshotReflectsRunningAndListen(t *testing.T) {
82+
if testing.Short() {
83+
t.Skip("server startup/listen timing test (~18s under -race); runs in the stress-tests CI job")
84+
}
8285
rt := newTestRuntime(t)
8386

8487
rt.SetRunning(true)

internal/runtime/tool_invalidation_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,9 @@ func TestToolCacheInvalidation_MultipleServers(t *testing.T) {
398398
// removes its tools from the search index (Issue #285 fix).
399399
// This ensures disabled servers' tools don't appear in search results.
400400
func TestToolCacheInvalidation_DisableServerRemovesTools(t *testing.T) {
401+
if testing.Short() {
402+
t.Skip("heavy server-lifecycle test (~18s under -race); runs in the stress-tests CI job")
403+
}
401404
tempDir := t.TempDir()
402405
cfg := &config.Config{
403406
DataDir: tempDir,

internal/runtime/tool_quarantine_invariant_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,9 @@ func (m *quarantineModel) checkInvariant(t *rapid.T) {
353353
// TestRapidQuarantineStateMachine runs a property-based state machine test
354354
// using rapid to verify quarantine invariants hold across random action sequences.
355355
func TestRapidQuarantineStateMachine(t *testing.T) {
356+
if testing.Short() {
357+
t.Skip("heavy property test (~270s under -race); runs in the stress-tests CI job")
358+
}
356359
rapid.Check(t, func(t *rapid.T) {
357360
// Set up runtime with quarantine enabled
358361
tempDir, err := os.MkdirTemp("", "quarantine-rapid-*")
@@ -498,6 +501,9 @@ func TestRapidQuarantineStateMachine(t *testing.T) {
498501
// TestRapidInvariant_ChangedNeverAutoApproved is a focused property test that
499502
// specifically targets the "changed→approved without user action" invariant.
500503
func TestRapidInvariant_ChangedNeverAutoApproved(t *testing.T) {
504+
if testing.Short() {
505+
t.Skip("heavy property test (~70s under -race); runs in the stress-tests CI job")
506+
}
501507
rapid.Check(t, func(t *rapid.T) {
502508
tempDir, err := os.MkdirTemp("", "quarantine-changed-*")
503509
if err != nil {

internal/server/server.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1447,8 +1447,11 @@ func (s *Server) StopServer() error {
14471447
_ = s.logger.Sync()
14481448
}
14491449

1450-
// NEW: Verify all containers stopped with retry loop (instead of arbitrary 3s sleep)
1451-
if s.runtime.UpstreamManager().HasDockerContainers() {
1450+
// NEW: Verify all containers stopped with retry loop (instead of arbitrary 3s sleep).
1451+
// Gated on Docker isolation actually being in use — otherwise the `docker ps`
1452+
// probe + verifyContainersCleanedUp loop are pure waste (and add ~17s per
1453+
// Stop in test processes). No isolation ⇒ no managed containers ⇒ nothing to verify.
1454+
if s.runtime.UpstreamManager().UsesDockerIsolation() && s.runtime.UpstreamManager().HasDockerContainers() {
14521455
s.logger.Warn("STOPSERVER - Docker containers still running, verifying cleanup...")
14531456
_ = s.logger.Sync()
14541457
s.verifyContainersCleanedUp(cleanupCtx)

internal/upstream/manager.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,16 @@ func (m *Manager) shouldEnableDockerRecovery() bool {
234234
return false
235235
}
236236

237+
// UsesDockerIsolation reports whether this manager could have launched
238+
// Docker-isolated containers (global isolation on, a per-server isolation, or
239+
// a docker command). When false, no container cleanup verification is needed
240+
// on shutdown — shelling out to `docker ps` would be pure waste (and, in test
241+
// processes, adds ~17s/Close via the verification loop). Same predicate the
242+
// Docker recovery monitor uses, so behavior stays consistent.
243+
func (m *Manager) UsesDockerIsolation() bool {
244+
return m.shouldEnableDockerRecovery()
245+
}
246+
237247
// SetLogConfig sets the logging configuration for upstream server loggers
238248
func (m *Manager) SetLogConfig(logConfig *config.LogConfig) {
239249
m.mu.Lock()

0 commit comments

Comments
 (0)