Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 100 additions & 14 deletions lib/iris/src/iris/cluster/providers/gcp/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,22 +145,65 @@ def replace_var(match: re.Match) -> str:
# Create cache directory
sudo mkdir -p {{ cache_dir }}

find_gcloud() {
if command -v gcloud > /dev/null 2>&1; then
command -v gcloud
return 0
fi
if [ -x /snap/bin/gcloud ]; then
echo /snap/bin/gcloud
return 0
fi
return 1
}

configure_docker_auth_with_retries() {
local AR_HOST="$1"
local gcloud_bin=""
for attempt in $(seq 1 60); do
gcloud_bin=$(find_gcloud || true)
if [ -z "$gcloud_bin" ]; then
echo "[iris-init] Waiting for gcloud before configuring docker auth (attempt $attempt/60)"
sleep 2
continue
fi
echo "[iris-init] Configuring docker auth for $AR_HOST (attempt $attempt/60)"
if sudo "$gcloud_bin" auth configure-docker "$AR_HOST" -q; then
return 0
fi
echo "[iris-init] gcloud auth configure-docker failed for $AR_HOST (attempt $attempt/60)"
sleep 2
done
echo "[iris-init] ERROR: Failed to configure docker auth for $AR_HOST after 60 attempts"
return 1
}

pull_docker_image_with_retries() {
local image="$1"
for attempt in $(seq 1 5); do
if sudo docker pull "$image"; then
return 0
fi
echo "[iris-init] Docker pull failed (attempt $attempt/5)"
if [ "$attempt" -lt 5 ]; then
sleep 5
fi
done
echo "[iris-init] ERROR: Failed to pull image after 5 attempts: $image"
return 1
}

echo "[iris-init] Phase: docker_pull"
echo "[iris-init] Pulling image: {{ docker_image }}"

# Configure Artifact Registry auth on demand.
# Must run under sudo because `sudo docker pull` uses root's docker config.
if echo "{{ docker_image }}" | grep -q -- "-docker.pkg.dev/"; then
AR_HOST=$(echo "{{ docker_image }}" | cut -d/ -f1)
echo "[iris-init] Configuring docker auth for $AR_HOST"
if command -v gcloud &> /dev/null; then
sudo gcloud auth configure-docker "$AR_HOST" -q || true
else
echo "[iris-init] Warning: gcloud not found; AR pull may fail without prior auth"
fi
configure_docker_auth_with_retries "$AR_HOST"
Comment thread
rjpower marked this conversation as resolved.
fi

sudo docker pull {{ docker_image }}
pull_docker_image_with_retries "{{ docker_image }}"

echo "[iris-init] Phase: config_setup"
sudo mkdir -p /etc/iris
Expand Down Expand Up @@ -315,22 +358,65 @@ def build_worker_bootstrap_script(
sudo sysctl -w net.ipv4.ip_local_port_range="1024 65535"
sudo sysctl -w net.ipv4.tcp_tw_reuse=1

find_gcloud() {
if command -v gcloud > /dev/null 2>&1; then
command -v gcloud
return 0
fi
if [ -x /snap/bin/gcloud ]; then
echo /snap/bin/gcloud
return 0
fi
return 1
}

configure_docker_auth_with_retries() {
local AR_HOST="$1"
local gcloud_bin=""
for attempt in $(seq 1 60); do
gcloud_bin=$(find_gcloud || true)
if [ -z "$gcloud_bin" ]; then
echo "[iris-controller] [3/5] Waiting for gcloud before configuring docker auth (attempt $attempt/60)"
sleep 2
continue
fi
echo "[iris-controller] [3/5] Configuring docker auth for $AR_HOST (attempt $attempt/60)"
if sudo "$gcloud_bin" auth configure-docker "$AR_HOST" -q; then
return 0
fi
echo "[iris-controller] [3/5] gcloud auth configure-docker failed for $AR_HOST (attempt $attempt/60)"
sleep 2
done
echo "[iris-controller] [3/5] ERROR: Failed to configure docker auth for $AR_HOST after 60 attempts"
return 1
}

pull_docker_image_with_retries() {
local image="$1"
for attempt in $(seq 1 5); do
if sudo docker pull "$image"; then
return 0
fi
echo "[iris-controller] [4/5] Docker pull failed (attempt $attempt/5)"
if [ "$attempt" -lt 5 ]; then
sleep 5
fi
done
echo "[iris-controller] [4/5] ERROR: Failed to pull image after 5 attempts: $image"
return 1
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: The three shell functions (find_gcloud, configure_docker_auth_with_retries, pull_docker_image_with_retries) are duplicated nearly verbatim between the worker and controller templates (~48 lines each), differing only in the log prefix ([iris-init] vs [iris-controller] [3/5]).

One option to reduce drift: extract a shared _DOCKER_HELPERS_TEMPLATE that takes {{ log_prefix }} as a parameter, then render it into both scripts. That way a future fix (e.g., changing retry counts or adding a new fallback path) only needs to happen once.

Not blocking — the duplication is tolerable for a targeted production fix, but worth cleaning up as a follow-up.


echo "[iris-controller] [3/5] Pulling image: {{ docker_image }}"
echo "[iris-controller] This may take several minutes for large images..."

# Configure Artifact Registry auth on demand.
# Must run under sudo because `sudo docker pull` uses root's docker config.
if echo "{{ docker_image }}" | grep -q -- "-docker.pkg.dev/"; then
AR_HOST=$(echo "{{ docker_image }}" | cut -d/ -f1)
echo "[iris-controller] [3/5] Configuring docker auth for $AR_HOST"
if command -v gcloud &> /dev/null; then
sudo gcloud auth configure-docker "$AR_HOST" -q || true
else
echo "[iris-controller] [3/5] Warning: gcloud not found; AR pull may fail without prior auth"
fi
configure_docker_auth_with_retries "$AR_HOST"
fi

if sudo docker pull {{ docker_image }}; then
if pull_docker_image_with_retries "{{ docker_image }}"; then
echo "[iris-controller] [4/5] Image pull complete"
else
echo "[iris-controller] [4/5] ERROR: Image pull failed"
Expand Down
19 changes: 17 additions & 2 deletions lib/iris/tests/cluster/providers/gcp/test_bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,13 @@ def test_build_worker_bootstrap_script_configures_ar_auth() -> None:
script = build_worker_bootstrap_script(cfg)

assert f'if echo "{ar_image}" | grep -q -- "-docker.pkg.dev/"' in script
assert 'sudo gcloud auth configure-docker "$AR_HOST" -q || true' in script
assert "if [ -x /snap/bin/gcloud ]; then" in script
assert 'echo "[iris-init] Waiting for gcloud before configuring docker auth (attempt $attempt/60)"' in script
assert 'echo "[iris-init] Configuring docker auth for $AR_HOST (attempt $attempt/60)"' in script
assert 'echo "[iris-init] Docker pull failed (attempt $attempt/5)"' in script
assert 'echo "[iris-init] ERROR: Failed to configure docker auth for $AR_HOST after 60 attempts"' in script
assert "sleep 2" in script
assert "sleep 5" in script
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: these tests assert on exact log message strings embedded in the generated script, which means any log wording change requires updating both the template and the test. This is a common trade-off for template-based tests and is fine here — just flagging that these assertions will be brittle if the log format evolves.

An alternative would be to assert on structural markers (e.g., "find_gcloud()" in script, "configure_docker_auth_with_retries" in script) rather than full log lines, but the current approach gives more confidence that the full retry machinery is present.



def test_build_worker_bootstrap_script_requires_controller_address() -> None:
Expand Down Expand Up @@ -154,7 +160,16 @@ def resolve_image(image: str, zone: str | None = None) -> str:
"Pulling image: europe-docker.pkg.dev/hai-gcp-models/ghcr-mirror/marin-community/iris-controller:latest"
in script
)
assert 'sudo gcloud auth configure-docker "$AR_HOST" -q || true' in script
assert "if [ -x /snap/bin/gcloud ]; then" in script
assert (
'echo "[iris-controller] [3/5] Waiting for gcloud before configuring docker auth (attempt $attempt/60)"'
in script
)
assert 'echo "[iris-controller] [3/5] Configuring docker auth for $AR_HOST (attempt $attempt/60)"' in script
assert 'echo "[iris-controller] [4/5] Docker pull failed (attempt $attempt/5)"' in script
assert (
'echo "[iris-controller] [3/5] ERROR: Failed to configure docker auth for $AR_HOST after 60 attempts"' in script
)


# --- GcpWorkerProvider.resolve_image() tests ---
Expand Down
Loading