From f94abad55fa42924f454db6731bab0fd64a5f334 Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Wed, 8 Apr 2026 18:43:06 -0700 Subject: [PATCH 1/3] Use larger runner for (most) integration test suites Signed-off-by: Brian Goff --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ffffa3dc..37ce14259 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -88,7 +88,7 @@ jobs: fi integration: - runs-on: ubuntu-22.04 + runs-on: ${{ matrix.suite == 'other' && 'ubuntu-22.04' || 'ubuntu-latest-4-cores' }} strategy: fail-fast: false matrix: From 86fbcc1011bbca954dfc4bf31d86c48263a64f85 Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Thu, 9 Apr 2026 08:00:27 -0700 Subject: [PATCH 2/3] ci: Fix flakiness restarting dockerd After switching to larger runners CI had 3 jobs where dockerd just would not start. Seemingly because we are restarting docker (for config updates) quickly enough such that systemd refuses to restart it. This change resets the fail counter in systemd if docker fails to restart and tries again. Signed-off-by: Brian Goff --- .../dns-spoof-ubuntu-archive/action.yml | 10 +++++++- .github/actions/enable-containerd/action.yml | 8 +++++- .github/workflows/ci.yml | 25 +++++++++++++++---- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/.github/actions/dns-spoof-ubuntu-archive/action.yml b/.github/actions/dns-spoof-ubuntu-archive/action.yml index 33fea7d44..bba288373 100644 --- a/.github/actions/dns-spoof-ubuntu-archive/action.yml +++ b/.github/actions/dns-spoof-ubuntu-archive/action.yml @@ -49,6 +49,14 @@ runs: sudo mkdir -p /etc/docker jq --arg dns "${DNSMASQ_IP}" '.dns = [$dns]' "${tmp}" | sudo tee /etc/docker/daemon.json - sudo systemctl restart docker + sudo systemctl stop docker + if ! sudo systemctl start docker; then + sudo systemctl reset-failed docker + if ! 
sudo systemctl start docker; then + echo "::error::error restarting dockerd with custom DNS" + journalctl -u docker + exit 1 + fi + fi env: DNSMASQ_IP: ${{ steps.dnsmasq-config.outputs.DNSMASQ_IP }} diff --git a/.github/actions/enable-containerd/action.yml b/.github/actions/enable-containerd/action.yml index 5d9a2af68..2ab5e0b5f 100644 --- a/.github/actions/enable-containerd/action.yml +++ b/.github/actions/enable-containerd/action.yml @@ -20,4 +20,10 @@ runs: sudo cp "${tmp}" /etc/docker/daemon.json rm "${tmp}" - sudo systemctl restart docker \ No newline at end of file + sudo systemctl stop docker + if ! sudo systemctl start docker; then + echo "::error::error restarting dockerd with containerd enabled" + journalctl -u docker + ps aux | grep docker + exit 1 + fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37ce14259..bd3c8696c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -137,6 +137,10 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Docker info + run: | + docker info + docker version - name: Setup otel-collector run: | set -e @@ -163,16 +167,27 @@ jobs: sudo cp "${tmp}" /etc/systemd/system/docker.service.d/otlp.conf sudo cp "${tmp}" /etc/systemd/system/containerd.service.d/otlp.conf + sudo systemctl stop docker.service docker.socket + sudo systemctl stop containerd + if ! sudo systemctl daemon-reload; then echo "::warning::Failed to reload systemd daemon for tracing configuration" fi - if ! sudo systemctl restart containerd; then - echo "::warning::Failed to restart containerd with tracing configuration" + + if ! sudo systemctl start containerd; then + echo "::error::Failed to restart containerd with tracing configuration" + journalctl -u containerd + exit 1 fi - if ! sudo systemctl restart docker; then - echo "::warning::Failed to restart docker with tracing configuration" + if ! sudo systemctl start docker; then + sudo systemctl reset-failed docker + if ! 
sudo systemctl start docker; then + echo "::error::Failed to restart docker with tracing configuration" + journalctl -u docker + ps aux | grep dockerd + exit 1 + fi fi - - name: download deps run: go mod download - name: Setup QEMU From 8409f8e62d81e06e7628dceb120ec3161defb6e7 Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Fri, 10 Apr 2026 10:59:54 -0700 Subject: [PATCH 3/3] ci: collect dockerd pprof dumps on test timeout Add timeout signaling from test2json2gha to GITHUB_OUTPUT so subsequent CI steps can detect when tests timed out. On timeout, the dump logs step now collects goroutine stacks, a binary heap profile, and the dockerd binary from the runner for offline analysis with go tool pprof. Signed-off-by: Brian Goff --- .github/workflows/ci.yml | 23 +++++++++++++++++++---- cmd/test2json2gha/main.go | 26 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bd3c8696c..b004bbe55 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -299,6 +299,7 @@ jobs: env: TEST_SUITE: ${{ matrix.suite }} - name: Run integration tests + id: run-tests run: | set -ex if [ -n "${TEST_SUITE}" ] && [ ! 
"${TEST_SUITE}" = "other" ]; then @@ -342,15 +343,29 @@ jobs: set -e dir="$(mktemp -d)" - f="${dir}/dockerd.log" - echo "DOCKERD_LOG_PATH=${f}" >> $GITHUB_OUTPUT - sudo journalctl -u docker > "${f}" + echo "DOCKERD_LOG_PATH=${dir}" >> $GITHUB_OUTPUT + + if [ "${{ steps.run-tests.outputs.test_timeout }}" = "true" ]; then + echo "::group::Collecting pprof data from dockerd (test timeout detected)" + curl --unix-socket /var/run/docker.sock \ + -o "${dir}/goroutine-stacks.txt" \ + "http://localhost/debug/pprof/goroutine?debug=2" || true + + curl --unix-socket /var/run/docker.sock \ + -o "${dir}/heap-profile.bin" \ + "http://localhost/debug/pprof/heap" || true + + cp "$(which dockerd)" "${dir}/dockerd" || true + echo "::endgroup::" + fi + + sudo journalctl -u docker > "${dir}/dockerd.log" - name: Upload buildkit logs if: failure() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: e2e-dockerd-logs-${{ matrix.suite }} - path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }} + path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }}/* retention-days: 1 unit: diff --git a/cmd/test2json2gha/main.go b/cmd/test2json2gha/main.go index 3c81441bf..55d546de7 100644 --- a/cmd/test2json2gha/main.go +++ b/cmd/test2json2gha/main.go @@ -5,6 +5,7 @@ import ( "flag" "fmt" "io" + "iter" "log/slog" "os" "runtime/debug" @@ -73,6 +74,7 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) { var wg waitGroup results.markUnfinishedAsTimeout() + signalTimeout(results.Results()) wg.Go(func() { var rf ResultsFormatter @@ -135,6 +137,30 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) { return bool(anyFailed), nil } +// signalTimeout writes test_timeout=true to GITHUB_OUTPUT if any test timed out. +// This allows subsequent CI steps to detect that a timeout occurred. 
+func signalTimeout(results iter.Seq[*TestResult]) {
+	// GITHUB_OUTPUT unset or empty: there is nowhere to write the signal.
+	ghOutput := os.Getenv("GITHUB_OUTPUT")
+	if ghOutput == "" {
+		return
+	}
+
+	for r := range results {
+		if !r.timeout {
+			continue
+		}
+
+		// The first timed-out result is enough: write the flag once and stop.
+		// The file is opened append-only and without O_CREATE, so existing
+		// content is kept and a missing file is reported as an open error.
+		f, err := os.OpenFile(ghOutput, os.O_WRONLY|os.O_APPEND, 0)
+		if err != nil {
+			slog.Error("Error opening GITHUB_OUTPUT", "error", err)
+			return
+		}
+		if _, err := fmt.Fprintln(f, "test_timeout=true"); err != nil {
+			slog.Error("Error writing timeout status to GITHUB_OUTPUT", "error", err)
+		}
+		// A failed Close on a file opened for writing can mean the appended
+		// line was not persisted, so log it rather than dropping the error.
+		if err := f.Close(); err != nil {
+			slog.Error("Error closing GITHUB_OUTPUT", "error", err)
+		}
+		return
+	}
+}
+
 type waitGroup struct {
 	sync.WaitGroup
 }