Skip to content

Commit e95148d

Browse files
authored
Mitigate intermittent SSL runner timeouts on FreeBSD CI (#3171)
### Description of changes: The FreeBSD 15.0 (x86-64) CI job fails intermittently on CBC record splitting tests (`CBCRecordSplitting-AES128`, `CBCRecordSplittingPartialWrite-AES128`) with runner-side TCP timeouts like `read tcp [::1]:...->[::1]:...: i/o timeout`. The QEMU-emulated FreeBSD VM is occasionally too slow to respond within the runner's default 30-second idle timeout. The runner already has retry logic for shim-side idle timeouts and transient signals, but runner-side TCP timeouts (`net.Error` with `Timeout() == true`) fall through without a retry — the test just fails. This PR does two things: 1. Increases the idle timeout to 120s and enables retry-on-timeout for the FreeBSD CI job. 2. Adds a new `-retry-on-timeout` flag to the SSL test runner that retries tests on runner-side TCP timeouts. A real bug will fail again on the retry. ### Call-outs: The `-retry-on-timeout` flag is off by default. It's gated behind `AWS_LC_SSL_RUNNER_RETRY_ON_TIMEOUT` at the CMake level, currently only enabled for FreeBSD. If similar flakiness shows up on NetBSD or OpenBSD, we can add the same env vars to those jobs. This PR also fixes a pre-existing variable shadowing bug in the retry block: `shim, err := newShimProcess(...)` was creating a new `shim` scoped to the `if` block, so after a successful retry the code would still read `stdout`/`stderr` from the original failed shim. This caused retried tests to fail with `unexpected error output` even when the retry passed. Changed to `shim, err = ...` to reassign the outer variable. ### Testing: This addresses a CI flakiness issue — the fix will be validated by observing the FreeBSD job over subsequent runs. The retry logic is structurally identical to the existing `shim.idled` retry path. By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license and the ISC license.
1 parent b624702 commit e95148d

3 files changed

Lines changed: 25 additions & 7 deletions

File tree

.github/workflows/actions-ci.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -711,9 +711,11 @@ jobs:
711711
uses: cross-platform-actions/action@v0.32.0
712712
env:
713713
AWS_LC_GO_TEST_TIMEOUT: 90m
714+
AWS_LC_SSL_RUNNER_IDLE_TIMEOUT: 120s
715+
AWS_LC_SSL_RUNNER_RETRY_ON_TIMEOUT: 1
714716
GOFLAGS: "-buildvcs=false"
715717
with:
716-
environment_variables: "AWS_LC_GO_TEST_TIMEOUT GOFLAGS"
718+
environment_variables: "AWS_LC_GO_TEST_TIMEOUT AWS_LC_SSL_RUNNER_IDLE_TIMEOUT AWS_LC_SSL_RUNNER_RETRY_ON_TIMEOUT GOFLAGS"
717719
operating_system: freebsd
718720
architecture: ${{ matrix.arch }}
719721
version: ${{ matrix.version }}

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,6 +1350,13 @@ if(DEFINED ENV{AWS_LC_SSL_RUNNER_IDLE_TIMEOUT})
13501350
set(RUNNER_ARGS ${RUNNER_ARGS} "-idle-timeout" "$ENV{AWS_LC_SSL_RUNNER_IDLE_TIMEOUT}")
13511351
endif()
13521352

1353+
# Allow enabling retry on runner-side TCP timeouts.
1354+
# Useful for resource-constrained environments (e.g. emulated FreeBSD VMs) where
1355+
# transient timeouts do not indicate real bugs.
1356+
if(DEFINED ENV{AWS_LC_SSL_RUNNER_RETRY_ON_TIMEOUT})
1357+
set(RUNNER_ARGS ${RUNNER_ARGS} "-retry-on-timeout")
1358+
endif()
1359+
13531360
if (NOT ${CMAKE_VERSION} VERSION_LESS "3.2")
13541361
# USES_TERMINAL is only available in CMake 3.2 or later.
13551362
set(MAYBE_USES_TERMINAL USES_TERMINAL)

ssl/test/runner/runner.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ var (
6969
includeDisabled = flag.Bool("include-disabled", false, "If true, also runs disabled tests.")
7070
repeatUntilFailure = flag.Bool("repeat-until-failure", false, "If true, the first selected test will be run repeatedly until failure.")
7171
// Added by aws-lc
72+
retryOnTimeout = flag.Bool("retry-on-timeout", false, "If true, retry tests that fail due to a runner-side TCP timeout. Useful for resource-constrained environments (e.g. emulated VMs).")
7273
sslTransferConfig = flag.String("ssl-transfer-test-file", "", "A path to file which includes the test names that can be converted for SSL transfer.")
7374
sslFuzzSeedDir = flag.String("ssl-fuzz-seed-dir", "", "The directory in which to write the output of |SSL_to_bytes|.")
7475
testCaseStartIndex = flag.Int("test-case-start-index", -1, "If non-negative, test case is filtered in if the index in |testCases| >= test-case-start-index.")
@@ -1671,17 +1672,25 @@ func runTest(dispatcher *shimDispatcher, statusChan chan statusMsg, test *testCa
16711672
var retryReason string
16721673
if shim.idled {
16731674
retryReason = "idle timeout"
1674-
} else if exitErr, ok := childErr.(*exec.ExitError); ok {
1675-
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok && status.Signaled() {
1676-
switch status.Signal() {
1677-
case syscall.SIGABRT, syscall.SIGKILL, syscall.SIGTERM, syscall.SIGPIPE:
1678-
retryReason = fmt.Sprintf("signal: %s", status.Signal())
1675+
} else if *retryOnTimeout {
1676+
if netErr, ok := localErr.(net.Error); ok && netErr.Timeout() {
1677+
retryReason = "local timeout"
1678+
}
1679+
}
1680+
if retryReason == "" {
1681+
if exitErr, ok := childErr.(*exec.ExitError); ok {
1682+
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok && status.Signaled() {
1683+
switch status.Signal() {
1684+
case syscall.SIGABRT, syscall.SIGKILL, syscall.SIGTERM, syscall.SIGPIPE:
1685+
retryReason = fmt.Sprintf("signal: %s", status.Signal())
1686+
}
16791687
}
16801688
}
16811689
}
16821690
if retryReason != "" {
16831691
fmt.Fprintf(os.Stderr, "Retrying %s (%s)\n", test.name, retryReason)
1684-
shim, err := newShimProcess(dispatcher, shimPath, flags, env)
1692+
var err error
1693+
shim, err = newShimProcess(dispatcher, shimPath, flags, env)
16851694
if err != nil {
16861695
return err
16871696
}

0 commit comments

Comments
 (0)