Skip to content

Commit 3805728

Browse files
authored
fix: pre-pull and cache k3s node image to prevent E2E cluster creation failures (#287)
k3d cluster creation fails intermittently when Docker Hub is slow or rate-limited because the k3s node image (rancher/k3s) is not pre-pulled or cached. While cert-manager, Prometheus, stress-ng, and busybox images are all pre-pulled and cached as tarballs, the k3s image was left to k3d to pull on the fly during cluster creation.

This caused all 3 retry attempts to fail with 'Client.Timeout exceeded while awaiting headers' in the nightly E2E run for K8s v1.34 (run 26858361803), while v1.32, v1.33, and v1.35 succeeded on runners with better connectivity.

Changes:
- Add k3s image to pull_and_save() in both e2e-nightly.yaml and the setup-e2e-cluster composite action
- Add /tmp/k3s.tar to the actions/cache path and include the k3s version in the cache key
- Add docker load before k3d cluster create so the image is available locally regardless of Docker Hub connectivity
- Add retry logic (3 attempts with 10s backoff) to pull_and_save() for all images, not just k3s

Signed-off-by: Sebastien Tardif <sebtardif@ncf.ca>
1 parent ec0db8d commit 3805728

2 files changed

Lines changed: 38 additions & 8 deletions

File tree

.github/actions/setup-e2e-cluster/action.yaml

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ runs:
5959
/tmp/cert-manager-cainjector.tar
6060
/tmp/prometheus.tar
6161
/tmp/stress-ng.tar
62-
key: e2e-images-${{ runner.os }}-cm${{ inputs.cert-manager-version }}-prom${{ inputs.prometheus-chart-version }}-stress0.20.01
62+
/tmp/k3s.tar
63+
key: e2e-images-${{ runner.os }}-cm${{ inputs.cert-manager-version }}-prom${{ inputs.prometheus-chart-version }}-stress0.20.01-k3s${{ inputs.k3s-image }}
6364

6465
- uses: ./.github/actions/install-binary-tool
6566
with:
@@ -87,24 +88,39 @@ runs:
8788
docker builder prune -f || true
8889
docker system prune -f || true
8990
90-
- name: Pre-pull cert-manager and Prometheus images
91+
- name: Pre-pull container images
9192
shell: bash -Eeuo pipefail -x {0}
9293
run: |
94+
# Pull images with docker (parallel layer downloads) and save as
95+
# tarballs for k3d import. Skip images already restored from cache.
96+
# Retries handle transient Docker Hub connectivity issues.
9397
pull_and_save() {
9498
local image="$1" tarball="$2"
9599
[[ -f "$tarball" ]] && { echo "Cached: $tarball"; return 0; }
96-
docker pull --platform linux/amd64 "$image"
97-
docker save "$image" -o "$tarball"
100+
for attempt in 1 2 3; do
101+
if docker pull --platform linux/amd64 "$image"; then
102+
docker save "$image" -o "$tarball"
103+
return 0
104+
fi
105+
echo "::warning::docker pull attempt $attempt for $image failed, retrying in 10s..."
106+
sleep 10
107+
done
108+
echo "::error::Failed to pull $image after 3 attempts"
109+
return 1
98110
}
99111
pull_and_save "quay.io/jetstack/cert-manager-controller:${{ inputs.cert-manager-version }}" /tmp/cert-manager-controller.tar
100112
pull_and_save "quay.io/jetstack/cert-manager-webhook:${{ inputs.cert-manager-version }}" /tmp/cert-manager-webhook.tar
101113
pull_and_save "quay.io/jetstack/cert-manager-cainjector:${{ inputs.cert-manager-version }}" /tmp/cert-manager-cainjector.tar
102114
pull_and_save "${{ inputs.prometheus-image }}" /tmp/prometheus.tar
103115
pull_and_save "${{ inputs.stress-ng-image }}" /tmp/stress-ng.tar
116+
pull_and_save "${{ inputs.k3s-image }}" /tmp/k3s.tar
104117
105118
- name: Create k3d cluster
106119
shell: bash -Eeuo pipefail -x {0}
107120
run: |
121+
# Load the pre-pulled k3s image into Docker so k3d uses it
122+
# without pulling from Docker Hub (avoids rate-limit/timeout failures).
123+
docker load -i /tmp/k3s.tar
108124
for attempt in 1 2 3; do
109125
if k3d cluster create "${{ inputs.cluster-name }}" \
110126
--image ${{ inputs.k3s-image }} \

.github/workflows/e2e-nightly.yaml

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,8 @@ jobs:
135135
/tmp/prometheus.tar
136136
/tmp/stress-ng.tar
137137
/tmp/busybox.tar
138-
key: e2e-images-${{ runner.os }}-cm${{ env.CERT_MANAGER_VERSION }}-prom${{ env.PROMETHEUS_CHART_VERSION }}-stress0.20.01-busybox1.37
138+
/tmp/k3s.tar
139+
key: e2e-images-${{ runner.os }}-cm${{ env.CERT_MANAGER_VERSION }}-prom${{ env.PROMETHEUS_CHART_VERSION }}-stress0.20.01-busybox1.37-k3s${{ matrix.k3s-image }}
139140

140141
- uses: ./.github/actions/install-binary-tool
141142
with:
@@ -163,28 +164,41 @@ jobs:
163164
docker builder prune -f || true
164165
docker system prune -f || true
165166
166-
- name: Pre-pull cert-manager and Prometheus images
167+
- name: Pre-pull container images
167168
shell: bash -Eeuo pipefail -x {0}
168169
run: |
169170
# Pull images with docker (parallel layer downloads) and save as
170171
# tarballs for k3d import. Skip images already restored from cache.
172+
# Retries handle transient Docker Hub connectivity issues.
171173
pull_and_save() {
172174
local image="$1" tarball="$2"
173175
[[ -f "$tarball" ]] && { echo "Cached: $tarball"; return 0; }
174-
docker pull --platform linux/amd64 "$image"
175-
docker save "$image" -o "$tarball"
176+
for attempt in 1 2 3; do
177+
if docker pull --platform linux/amd64 "$image"; then
178+
docker save "$image" -o "$tarball"
179+
return 0
180+
fi
181+
echo "::warning::docker pull attempt $attempt for $image failed, retrying in 10s..."
182+
sleep 10
183+
done
184+
echo "::error::Failed to pull $image after 3 attempts"
185+
return 1
176186
}
177187
pull_and_save "quay.io/jetstack/cert-manager-controller:${{ env.CERT_MANAGER_VERSION }}" /tmp/cert-manager-controller.tar
178188
pull_and_save "quay.io/jetstack/cert-manager-webhook:${{ env.CERT_MANAGER_VERSION }}" /tmp/cert-manager-webhook.tar
179189
pull_and_save "quay.io/jetstack/cert-manager-cainjector:${{ env.CERT_MANAGER_VERSION }}" /tmp/cert-manager-cainjector.tar
180190
pull_and_save "${{ env.PROMETHEUS_IMAGE }}" /tmp/prometheus.tar
181191
pull_and_save "${{ env.STRESS_NG_IMAGE }}" /tmp/stress-ng.tar
182192
pull_and_save "${{ env.CPU_BURN_IMAGE }}" /tmp/busybox.tar
193+
pull_and_save "rancher/k3s:${{ matrix.k3s-image }}" /tmp/k3s.tar
183194
184195
- name: Create k3d cluster
185196
shell: bash -Eeuo pipefail -x {0}
186197
run: |
187198
rm -f "$KUBECONFIG"
199+
# Load the pre-pulled k3s image into Docker so k3d uses it
200+
# without pulling from Docker Hub (avoids rate-limit/timeout failures).
201+
docker load -i /tmp/k3s.tar
188202
# Retry cluster creation up to 3 times. k3d can fail transiently
189203
# when Docker containers vanish mid-creation due to daemon
190204
# contention with concurrent CI runs.

0 commit comments

Comments
 (0)