Skip to content

Commit 8409f8e

Browse files
committed
ci: collect dockerd pprof dumps on test timeout
Add timeout signaling from test2json2gha to GITHUB_OUTPUT so subsequent CI steps can detect when tests timed out. On timeout, the dump logs step now collects goroutine stacks, a binary heap profile, and the dockerd binary from the runner for offline analysis with go tool pprof. Signed-off-by: Brian Goff <cpuguy83@gmail.com>
1 parent 86fbcc1 commit 8409f8e

File tree

2 files changed

+45
-4
lines changed

2 files changed

+45
-4
lines changed

.github/workflows/ci.yml

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ jobs:
299299
env:
300300
TEST_SUITE: ${{ matrix.suite }}
301301
- name: Run integration tests
302+
id: run-tests
302303
run: |
303304
set -ex
304305
if [ -n "${TEST_SUITE}" ] && [ ! "${TEST_SUITE}" = "other" ]; then
@@ -342,15 +343,29 @@ jobs:
342343
set -e
343344
344345
dir="$(mktemp -d)"
345-
f="${dir}/dockerd.log"
346-
echo "DOCKERD_LOG_PATH=${f}" >> $GITHUB_OUTPUT
347-
sudo journalctl -u docker > "${f}"
346+
echo "DOCKERD_LOG_PATH=${dir}" >> $GITHUB_OUTPUT
347+
348+
if [ "${{ steps.run-tests.outputs.test_timeout }}" = "true" ]; then
349+
echo "::group::Collecting pprof data from dockerd (test timeout detected)"
350+
curl --unix-socket /var/run/docker.sock \
351+
-o "${dir}/goroutine-stacks.txt" \
352+
"http://localhost/debug/pprof/goroutine?debug=2" || true
353+
354+
curl --unix-socket /var/run/docker.sock \
355+
-o "${dir}/heap-profile.bin" \
356+
"http://localhost/debug/pprof/heap" || true
357+
358+
cp "$(which dockerd)" "${dir}/dockerd" || true
359+
echo "::endgroup::"
360+
fi
361+
362+
sudo journalctl -u docker > "${dir}/dockerd.log"
348363
- name: Upload buildkit logs
349364
if: failure()
350365
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
351366
with:
352367
name: e2e-dockerd-logs-${{ matrix.suite }}
353-
path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }}
368+
path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }}/*
354369
retention-days: 1
355370

356371
unit:

cmd/test2json2gha/main.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"flag"
66
"fmt"
77
"io"
8+
"iter"
89
"log/slog"
910
"os"
1011
"runtime/debug"
@@ -73,6 +74,7 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) {
7374
var wg waitGroup
7475

7576
results.markUnfinishedAsTimeout()
77+
signalTimeout(results.Results())
7678

7779
wg.Go(func() {
7880
var rf ResultsFormatter
@@ -135,6 +137,30 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) {
135137
return bool(anyFailed), nil
136138
}
137139

140+
// signalTimeout writes test_timeout=true to GITHUB_OUTPUT if any test timed out.
141+
// This allows subsequent CI steps to detect that a timeout occurred.
142+
func signalTimeout(results iter.Seq[*TestResult]) {
143+
ghOutput := os.Getenv("GITHUB_OUTPUT")
144+
if ghOutput == "" {
145+
return
146+
}
147+
148+
for r := range results {
149+
if r.timeout {
150+
f, err := os.OpenFile(ghOutput, os.O_WRONLY|os.O_APPEND, 0)
151+
if err != nil {
152+
slog.Error("Error opening GITHUB_OUTPUT", "error", err)
153+
return
154+
}
155+
if _, err := fmt.Fprintln(f, "test_timeout=true"); err != nil {
156+
slog.Error("Error writing timeout status to GITHUB_OUTPUT", "error", err)
157+
}
158+
f.Close()
159+
return
160+
}
161+
}
162+
}
163+
138164
type waitGroup struct {
139165
sync.WaitGroup
140166
}

0 commit comments

Comments
 (0)