diff --git a/.github/actions/dns-spoof-ubuntu-archive/action.yml b/.github/actions/dns-spoof-ubuntu-archive/action.yml index 33fea7d44..bba288373 100644 --- a/.github/actions/dns-spoof-ubuntu-archive/action.yml +++ b/.github/actions/dns-spoof-ubuntu-archive/action.yml @@ -49,6 +49,14 @@ runs: sudo mkdir -p /etc/docker jq --arg dns "${DNSMASQ_IP}" '.dns = [$dns]' "${tmp}" | sudo tee /etc/docker/daemon.json - sudo systemctl restart docker + sudo systemctl stop docker + if ! sudo systemctl start docker; then + sudo systemctl reset-failed docker + if ! sudo systemctl start docker; then + echo "::error::error restarting dockerd with custom DNS" + journalctl -u docker + exit 1 + fi + fi env: DNSMASQ_IP: ${{ steps.dnsmasq-config.outputs.DNSMASQ_IP }} diff --git a/.github/actions/enable-containerd/action.yml b/.github/actions/enable-containerd/action.yml index 5d9a2af68..2ab5e0b5f 100644 --- a/.github/actions/enable-containerd/action.yml +++ b/.github/actions/enable-containerd/action.yml @@ -20,4 +20,10 @@ runs: sudo cp "${tmp}" /etc/docker/daemon.json rm "${tmp}" - sudo systemctl restart docker \ No newline at end of file + sudo systemctl stop docker + if ! sudo systemctl start docker; then + echo "::error::error restarting dockerd with containerd enabled" + journalctl -u docker + ps aux | grep docker + exit 1 + fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ffffa3dc..b004bbe55 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -88,7 +88,7 @@ jobs: fi integration: - runs-on: ubuntu-22.04 + runs-on: ${{ matrix.suite == 'other' && 'ubuntu-22.04' || 'ubuntu-latest-4-cores' }} strategy: fail-fast: false matrix: @@ -137,6 +137,10 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Docker info + run: | + docker info + docker version - name: Setup otel-collector run: | set -e @@ -163,16 +167,27 @@ jobs: sudo cp "${tmp}" /etc/systemd/system/docker.service.d/otlp.conf sudo cp "${tmp}" /etc/systemd/system/containerd.service.d/otlp.conf + sudo systemctl stop docker.service docker.socket + sudo systemctl stop containerd + if ! sudo systemctl daemon-reload; then echo "::warning::Failed to reload systemd daemon for tracing configuration" fi - if ! sudo systemctl restart containerd; then - echo "::warning::Failed to restart containerd with tracing configuration" + + if ! sudo systemctl start containerd; then + echo "::error::Failed to restart containerd with tracing configuration" + journalctl -u containerd + exit 1 fi - if ! sudo systemctl restart docker; then - echo "::warning::Failed to restart docker with tracing configuration" + if ! sudo systemctl start docker; then + sudo systemctl reset-failed docker + if ! sudo systemctl start docker; then + echo "::error::Failed to restart docker with tracing configuration" + journalctl -u docker + ps aux | grep dockerd + exit 1 + fi fi - - name: download deps run: go mod download - name: Setup QEMU @@ -284,6 +299,7 @@ jobs: env: TEST_SUITE: ${{ matrix.suite }} - name: Run integration tests + id: run-tests run: | set -ex if [ -n "${TEST_SUITE}" ] && [ ! "${TEST_SUITE}" = "other" ]; then @@ -327,15 +343,29 @@ jobs: set -e dir="$(mktemp -d)" - f="${dir}/dockerd.log" - echo "DOCKERD_LOG_PATH=${f}" >> $GITHUB_OUTPUT - sudo journalctl -u docker > "${f}" + echo "DOCKERD_LOG_PATH=${dir}" >> $GITHUB_OUTPUT + + if [ "${{ steps.run-tests.outputs.test_timeout }}" = "true" ]; then + echo "::group::Collecting pprof data from dockerd (test timeout detected)" + curl --unix-socket /var/run/docker.sock \ + -o "${dir}/goroutine-stacks.txt" \ + "http://localhost/debug/pprof/goroutine?debug=2" || true + + curl --unix-socket /var/run/docker.sock \ + -o "${dir}/heap-profile.bin" \ + "http://localhost/debug/pprof/heap" || true + + cp "$(which dockerd)" "${dir}/dockerd" || true + echo "::endgroup::" + fi + + sudo journalctl -u docker > "${dir}/dockerd.log" - name: Upload buildkit logs if: failure() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: e2e-dockerd-logs-${{ matrix.suite }} - path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }} + path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }}/* retention-days: 1 unit: diff --git a/cmd/test2json2gha/main.go b/cmd/test2json2gha/main.go index 3c81441bf..55d546de7 100644 --- a/cmd/test2json2gha/main.go +++ b/cmd/test2json2gha/main.go @@ -5,6 +5,7 @@ import ( "flag" "fmt" "io" + "iter" "log/slog" "os" "runtime/debug" @@ -73,6 +74,7 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) { var wg waitGroup results.markUnfinishedAsTimeout() + signalTimeout(results.Results()) wg.Go(func() { var rf ResultsFormatter @@ -135,6 +137,30 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) { return bool(anyFailed), nil } +// signalTimeout writes test_timeout=true to GITHUB_OUTPUT if any test timed out. +// This allows subsequent CI steps to detect that a timeout occurred. +func signalTimeout(results iter.Seq[*TestResult]) { + ghOutput := os.Getenv("GITHUB_OUTPUT") + if ghOutput == "" { + return + } + + for r := range results { + if r.timeout { + f, err := os.OpenFile(ghOutput, os.O_WRONLY|os.O_APPEND, 0) + if err != nil { + slog.Error("Error opening GITHUB_OUTPUT", "error", err) + return + } + if _, err := fmt.Fprintln(f, "test_timeout=true"); err != nil { + slog.Error("Error writing timeout status to GITHUB_OUTPUT", "error", err) + } + f.Close() + return + } + } +} + type waitGroup struct { sync.WaitGroup }