diff --git a/lib/iris/src/iris/cluster/providers/gcp/bootstrap.py b/lib/iris/src/iris/cluster/providers/gcp/bootstrap.py index 08f2dc233a..7f4c88b7f2 100644 --- a/lib/iris/src/iris/cluster/providers/gcp/bootstrap.py +++ b/lib/iris/src/iris/cluster/providers/gcp/bootstrap.py @@ -108,6 +108,72 @@ def replace_var(match: re.Match) -> str: return result +# ============================================================================ +# Shared shell helpers +# ============================================================================ + +# Bash helpers shared by worker and controller bootstrap scripts. +# +# The race we're papering over: `gcloud` is provided by a snap/apt package that +# may not be on PATH yet when the startup script runs, and AR auth/docker pull +# are both prone to transient failures during image churn. Auth is retried 60x +# (2s apart) and pulls 5x (5s apart); loop counters are function-local. Each +# bootstrap renders this block with its own ``log_prefix`` to tag its output. +DOCKER_HELPERS_TEMPLATE = """ +find_gcloud() { + if command -v gcloud > /dev/null 2>&1; then + command -v gcloud + return 0 + fi + if [ -x /snap/bin/gcloud ]; then + echo /snap/bin/gcloud + return 0 + fi + return 1 +} + +configure_docker_auth_with_retries() { + local AR_HOST="$1" + local gcloud_bin="" attempt + for attempt in $(seq 1 60); do + gcloud_bin=$(find_gcloud || true) + if [ -z "$gcloud_bin" ]; then + echo "{{ log_prefix }} Waiting for gcloud before configuring docker auth (attempt $attempt/60)" + sleep 2 + continue + fi + echo "{{ log_prefix }} Configuring docker auth for $AR_HOST (attempt $attempt/60)" + if sudo "$gcloud_bin" auth configure-docker "$AR_HOST" -q; then + return 0 + fi + echo "{{ log_prefix }} gcloud auth configure-docker failed for $AR_HOST (attempt $attempt/60)" + sleep 2 + done + echo "{{ log_prefix }} ERROR: Failed to configure docker auth for $AR_HOST after 60 attempts" + return 1 +} + +pull_docker_image_with_retries() { + local image="$1" attempt + for attempt in $(seq 1 5); do + if sudo docker pull "$image"; then +
return 0 + fi + echo "{{ log_prefix }} Docker pull failed (attempt $attempt/5)" + if [ "$attempt" -lt 5 ]; then + sleep 5 + fi + done + echo "{{ log_prefix }} ERROR: Failed to pull image after 5 attempts: $image" + return 1 +} +""" + + +def _render_docker_helpers(log_prefix: str) -> str: + return render_template(DOCKER_HELPERS_TEMPLATE, log_prefix=log_prefix) + + # ============================================================================ # Worker Bootstrap Script # ============================================================================ @@ -144,7 +210,7 @@ def replace_var(match: re.Match) -> str: # Create cache directory sudo mkdir -p {{ cache_dir }} - +{{ docker_helpers }} echo "[iris-init] Phase: docker_pull" echo "[iris-init] Pulling image: {{ docker_image }}" @@ -152,15 +218,10 @@ def replace_var(match: re.Match) -> str: # Must run under sudo because `sudo docker pull` uses root's docker config. if echo "{{ docker_image }}" | grep -q -- "-docker.pkg.dev/"; then AR_HOST=$(echo "{{ docker_image }}" | cut -d/ -f1) - echo "[iris-init] Configuring docker auth for $AR_HOST" - if command -v gcloud &> /dev/null; then - sudo gcloud auth configure-docker "$AR_HOST" -q || true - else - echo "[iris-init] Warning: gcloud not found; AR pull may fail without prior auth" - fi + configure_docker_auth_with_retries "$AR_HOST" fi -sudo docker pull {{ docker_image }} +pull_docker_image_with_retries "{{ docker_image }}" echo "[iris-init] Phase: config_setup" sudo mkdir -p /etc/iris @@ -251,6 +312,7 @@ def build_worker_bootstrap_script( docker_image=worker_config.docker_image, worker_port=worker_config.port, worker_config_json=worker_config_json, + docker_helpers=_render_docker_helpers("[iris-init]"), ) @@ -314,7 +376,7 @@ def build_worker_bootstrap_script( # Tune network stack for high-connection workloads (#3066).
sudo sysctl -w net.ipv4.ip_local_port_range="1024 65535" sudo sysctl -w net.ipv4.tcp_tw_reuse=1 - +{{ docker_helpers }} echo "[iris-controller] [3/5] Pulling image: {{ docker_image }}" echo "[iris-controller] This may take several minutes for large images..." @@ -322,15 +384,10 @@ def build_worker_bootstrap_script( # Must run under sudo because `sudo docker pull` uses root's docker config. if echo "{{ docker_image }}" | grep -q -- "-docker.pkg.dev/"; then AR_HOST=$(echo "{{ docker_image }}" | cut -d/ -f1) - echo "[iris-controller] [3/5] Configuring docker auth for $AR_HOST" - if command -v gcloud &> /dev/null; then - sudo gcloud auth configure-docker "$AR_HOST" -q || true - else - echo "[iris-controller] [3/5] Warning: gcloud not found; AR pull may fail without prior auth" - fi + configure_docker_auth_with_retries "$AR_HOST" fi -if sudo docker pull {{ docker_image }}; then +if pull_docker_image_with_retries "{{ docker_image }}"; then echo "[iris-controller] [4/5] Image pull complete" else echo "[iris-controller] [4/5] ERROR: Image pull failed" @@ -463,6 +520,7 @@ def build_controller_bootstrap_script( config_volume=config_volume, config_flag=config_flag, fresh_flag="--fresh" if fresh else "", + docker_helpers=_render_docker_helpers("[iris-controller]"), ) diff --git a/lib/iris/tests/cluster/providers/gcp/test_bootstrap.py b/lib/iris/tests/cluster/providers/gcp/test_bootstrap.py index 719fa0fa7c..3a52feee60 100644 --- a/lib/iris/tests/cluster/providers/gcp/test_bootstrap.py +++ b/lib/iris/tests/cluster/providers/gcp/test_bootstrap.py @@ -43,8 +43,9 @@ def test_build_worker_bootstrap_script_configures_ar_auth() -> None: script = build_worker_bootstrap_script(cfg) + # AR images must trigger the auth path before the docker pull. 
assert f'if echo "{ar_image}" | grep -q -- "-docker.pkg.dev/"' in script - assert 'sudo gcloud auth configure-docker "$AR_HOST" -q || true' in script + assert 'configure_docker_auth_with_retries "$AR_HOST"' in script def test_build_worker_bootstrap_script_requires_controller_address() -> None: @@ -154,7 +155,8 @@ def resolve_image(image: str, zone: str | None = None) -> str: "Pulling image: europe-docker.pkg.dev/hai-gcp-models/ghcr-mirror/marin-community/iris-controller:latest" in script ) - assert 'sudo gcloud auth configure-docker "$AR_HOST" -q || true' in script + # The controller script must route AR auth through the shared retry helper. + assert 'configure_docker_auth_with_retries "$AR_HOST"' in script # --- GcpWorkerProvider.resolve_image() tests ---