Skip to content

Commit 6c698b8

Browse files
fix: Docker push retries (ai-dynamo#6456)
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
1 parent 7a6283e commit 6c698b8

File tree

4 files changed

+40
-7
lines changed

4 files changed

+40
-7
lines changed

.github/actions/docker-tag-push/action.yml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,19 @@ runs:
5151
ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
5252
run: |
5353
set -euo pipefail
54-
if [[ ${CONDITIONAL_TAG} != '' ]]; then
55-
docker tag ${LOCAL_IMAGE} ${ECR_HOSTNAME}/${CONDITIONAL_TAG}
56-
docker push ${ECR_HOSTNAME}/${CONDITIONAL_TAG}
54+
source "${{ github.action_path }}/retry_push.sh"
55+
56+
if [[ -n "${CONDITIONAL_TAG}" ]]; then
57+
docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${CONDITIONAL_TAG}"
58+
retry_push "${ECR_HOSTNAME}/${CONDITIONAL_TAG}"
5759
fi
5860
while IFS= read -r TAG; do
5961
if [ -z "$TAG" ]; then
6062
continue
6163
fi
6264
echo "Tagging and pushing: ${ECR_HOSTNAME}/${TAG}"
6365
docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${TAG}"
64-
docker push "${ECR_HOSTNAME}/${TAG}"
66+
retry_push "${ECR_HOSTNAME}/${TAG}"
6567
done <<< "$PUSH_TAGS"
6668
- name: ACR Tag and Push
6769
shell: bash
@@ -72,11 +74,13 @@ runs:
7274
AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
7375
run: |
7476
set -euo pipefail
77+
source "${{ github.action_path }}/retry_push.sh"
78+
7579
while IFS= read -r TAG; do
7680
if [ -z "$TAG" ]; then
7781
continue
7882
fi
7983
echo "Tagging and pushing: ${AZURE_ACR_HOSTNAME}/${TAG}"
8084
docker tag "${LOCAL_IMAGE}" "${AZURE_ACR_HOSTNAME}/${TAG}"
81-
docker push "${AZURE_ACR_HOSTNAME}/${TAG}"
85+
retry_push "${AZURE_ACR_HOSTNAME}/${TAG}"
8286
done <<< "$PUSH_TAGS"
.github/actions/docker-tag-push/retry_push.sh

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
# Push a Docker image, retrying with exponential backoff on failure.
#
# Usage: retry_push IMAGE [MAX_ATTEMPTS] [INITIAL_WAIT_SECONDS]
#
# Arguments:
#   $1 - image reference handed to `docker push` (required, non-empty)
#   $2 - total number of push attempts, integer >= 1 (default: 3)
#   $3 - seconds to wait before the first retry (default: 10); the wait
#        doubles after every failed attempt and never exceeds 120
# Outputs:
#   Progress and failure messages on stderr.
# Returns:
#   0 as soon as a push succeeds
#   1 when every attempt failed
#   2 on invalid arguments (no push is attempted)
#
# Safe under `set -e`: `docker push` runs as an `if` condition, so a failed
# push does not abort the calling script before the retry logic gets a chance.
retry_push() {
  local image="${1:-}"
  local max_attempts="${2:-3}"
  local wait_seconds="${3:-10}"
  local max_wait_seconds=120
  local attempt

  # An empty reference can never be pushed; fail fast instead of sleeping
  # through several pointless retries.
  if [[ -z "$image" ]]; then
    echo "retry_push: missing image reference" >&2
    return 2
  fi
  if [[ ! "$max_attempts" =~ ^[0-9]+$ || ! "$wait_seconds" =~ ^[0-9]+$ ]]; then
    echo "retry_push: attempts and wait must be non-negative integers" >&2
    return 2
  fi

  # Force base 10 so values with leading zeros are not read as octal.
  max_attempts=$((10#$max_attempts))
  wait_seconds=$((10#$wait_seconds))
  if ((max_attempts < 1)); then
    echo "retry_push: attempts must be at least 1" >&2
    return 2
  fi
  if ((wait_seconds > max_wait_seconds)); then
    wait_seconds=$max_wait_seconds
  fi

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if docker push "$image"; then
      return 0
    fi
    echo "Push failed for $image (attempt ${attempt}/${max_attempts})." >&2

    # Only wait when another attempt will actually follow.
    if ((attempt < max_attempts)); then
      echo "Retrying in ${wait_seconds}s..." >&2
      sleep "$wait_seconds"
      wait_seconds=$((wait_seconds * 2))
      if ((wait_seconds > max_wait_seconds)); then
        wait_seconds=$max_wait_seconds
      fi
    fi
  done

  echo "Push failed after ${max_attempts} attempts: $image" >&2
  return 1
}

.github/actions/skopeo-copy/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ runs:
109109
RETRY_DELAY=10
110110
for attempt in $(seq 1 $MAX_RETRIES); do
111111
echo "Attempt ${attempt}/${MAX_RETRIES}..."
112-
if skopeo copy --all "${SOURCE_REF}" "${TARGET_REF}"; then
112+
if skopeo copy --all --retry-times 3 "${SOURCE_REF}" "${TARGET_REF}"; then
113113
echo "target_image_ref=${{ inputs.target_registry }}/${TARGET_IMAGE}:${TARGET_TAG}" >> $GITHUB_OUTPUT
114114
echo "✅ Image copied successfully"
115115
exit 0

.github/workflows/build-test-distribute-flavor.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ on:
9090
description: 'Timeout in minutes for the copy to ACR step'
9191
required: false
9292
type: number
93-
default: 5
93+
default: 10
9494
secrets:
9595
AWS_DEFAULT_REGION:
9696
required: true

0 commit comments

Comments
 (0)