diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ffffa3dc..fd53a0ca1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -265,6 +265,13 @@ jobs: echo "================ CLEANUP COMPLETE ================" - name: Use azure ubuntu archive uses: ./.github/actions/dns-spoof-ubuntu-archive + - name: Start OOM monitor + run: | + sudo dmesg --clear || true + setsid sh -c 'sudo dmesg --follow 2>/dev/null | \ + grep --line-buffered -iE "oom|out of memory|killed process|invoked oom" \ + > /tmp/oom-monitor.log 2>&1' & + echo "OOM_MONITOR_PID=$!" >> "$GITHUB_ENV" - name: Pre-build base images run: | set -eu @@ -284,6 +291,7 @@ jobs: env: TEST_SUITE: ${{ matrix.suite }} - name: Run integration tests + id: run-tests run: | set -ex if [ -n "${TEST_SUITE}" ] && [ ! "${TEST_SUITE}" = "other" ]; then @@ -296,6 +304,23 @@ jobs: env: TEST_SUITE: ${{ matrix.suite }} TEST_SKIP: ${{ matrix.skip }} + - name: Check for OOM kills + if: always() + run: | + # dmesg --follow runs as root (via sudo); kill the monitor group with sudo too + sudo kill -- -"${OOM_MONITOR_PID}" 2>/dev/null || true + mkdir -p /tmp/reports + + if [ -s /tmp/oom-monitor.log ]; then + echo "::warning::OOM kills detected during test run" + cat /tmp/oom-monitor.log + cp /tmp/oom-monitor.log /tmp/reports/oom-monitor.log + else + echo "No OOM kills detected" + fi + + # Capture recent dmesg for context on any failures + sudo dmesg -T 2>/dev/null | tail -200 > /tmp/reports/dmesg-tail.log || true - name: Get traces if: always() run: | @@ -327,15 +352,29 @@ set -e dir="$(mktemp -d)" - f="${dir}/dockerd.log" - echo "DOCKERD_LOG_PATH=${f}" >> $GITHUB_OUTPUT - sudo journalctl -u docker > "${f}" + echo "DOCKERD_LOG_PATH=${dir}" >> $GITHUB_OUTPUT + + if [ "${{ steps.run-tests.outputs.test_timeout }}" = "true" ]; then + echo "::group::Collecting pprof data from dockerd (test timeout detected)" + curl --unix-socket /var/run/docker.sock \ + -o "${dir}/goroutine-stacks.txt" \ + "http://localhost/debug/pprof/goroutine?debug=2" || true + + curl --unix-socket 
/var/run/docker.sock \ + -o "${dir}/heap-profile.bin" \ + "http://localhost/debug/pprof/heap" || true + + cp "$(which dockerd)" "${dir}/dockerd" || true + echo "::endgroup::" + fi + + sudo journalctl -u docker > "${dir}/dockerd.log" - name: Upload buildkit logs if: failure() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: e2e-dockerd-logs-${{ matrix.suite }} - path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }} + path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }}/* retention-days: 1 unit: diff --git a/cmd/test2json2gha/main.go b/cmd/test2json2gha/main.go index 3c81441bf..55d546de7 100644 --- a/cmd/test2json2gha/main.go +++ b/cmd/test2json2gha/main.go @@ -5,6 +5,7 @@ import ( "flag" "fmt" "io" + "iter" "log/slog" "os" "runtime/debug" @@ -73,6 +74,7 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) { var wg waitGroup results.markUnfinishedAsTimeout() + signalTimeout(results.Results()) wg.Go(func() { var rf ResultsFormatter @@ -135,6 +137,30 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) { return bool(anyFailed), nil } +// signalTimeout writes test_timeout=true to GITHUB_OUTPUT if any test timed out. +// This allows subsequent CI steps to detect that a timeout occurred. +func signalTimeout(results iter.Seq[*TestResult]) { + ghOutput := os.Getenv("GITHUB_OUTPUT") + if ghOutput == "" { + return + } + + for r := range results { + if r.timeout { + f, err := os.OpenFile(ghOutput, os.O_WRONLY|os.O_APPEND, 0) + if err != nil { + slog.Error("Error opening GITHUB_OUTPUT", "error", err) + return + } + if _, err := fmt.Fprintln(f, "test_timeout=true"); err != nil { + slog.Error("Error writing timeout status to GITHUB_OUTPUT", "error", err) + } + f.Close() + return + } + } +}