-
Notifications
You must be signed in to change notification settings - Fork 109
[iris] Fix GCP bootstrap gcloud race #4813
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -145,22 +145,65 @@ def replace_var(match: re.Match) -> str: | |
| # Create cache directory | ||
| sudo mkdir -p {{ cache_dir }} | ||
|
|
||
| find_gcloud() { | ||
| if command -v gcloud > /dev/null 2>&1; then | ||
| command -v gcloud | ||
| return 0 | ||
| fi | ||
| if [ -x /snap/bin/gcloud ]; then | ||
| echo /snap/bin/gcloud | ||
| return 0 | ||
| fi | ||
| return 1 | ||
| } | ||
|
|
||
| configure_docker_auth_with_retries() { | ||
| local AR_HOST="$1" | ||
| local gcloud_bin="" | ||
| for attempt in $(seq 1 60); do | ||
| gcloud_bin=$(find_gcloud || true) | ||
| if [ -z "$gcloud_bin" ]; then | ||
| echo "[iris-init] Waiting for gcloud before configuring docker auth (attempt $attempt/60)" | ||
| sleep 2 | ||
| continue | ||
| fi | ||
| echo "[iris-init] Configuring docker auth for $AR_HOST (attempt $attempt/60)" | ||
| if sudo "$gcloud_bin" auth configure-docker "$AR_HOST" -q; then | ||
| return 0 | ||
| fi | ||
| echo "[iris-init] gcloud auth configure-docker failed for $AR_HOST (attempt $attempt/60)" | ||
| sleep 2 | ||
| done | ||
| echo "[iris-init] ERROR: Failed to configure docker auth for $AR_HOST after 60 attempts" | ||
| return 1 | ||
| } | ||
|
|
||
| pull_docker_image_with_retries() { | ||
| local image="$1" | ||
| for attempt in $(seq 1 5); do | ||
| if sudo docker pull "$image"; then | ||
| return 0 | ||
| fi | ||
| echo "[iris-init] Docker pull failed (attempt $attempt/5)" | ||
| if [ "$attempt" -lt 5 ]; then | ||
| sleep 5 | ||
| fi | ||
| done | ||
| echo "[iris-init] ERROR: Failed to pull image after 5 attempts: $image" | ||
| return 1 | ||
| } | ||
|
|
||
| echo "[iris-init] Phase: docker_pull" | ||
| echo "[iris-init] Pulling image: {{ docker_image }}" | ||
|
|
||
| # Configure Artifact Registry auth on demand. | ||
| # Must run under sudo because `sudo docker pull` uses root's docker config. | ||
| if echo "{{ docker_image }}" | grep -q -- "-docker.pkg.dev/"; then | ||
| AR_HOST=$(echo "{{ docker_image }}" | cut -d/ -f1) | ||
| echo "[iris-init] Configuring docker auth for $AR_HOST" | ||
| if command -v gcloud &> /dev/null; then | ||
| sudo gcloud auth configure-docker "$AR_HOST" -q || true | ||
| else | ||
| echo "[iris-init] Warning: gcloud not found; AR pull may fail without prior auth" | ||
| fi | ||
| configure_docker_auth_with_retries "$AR_HOST" | ||
| fi | ||
|
|
||
| sudo docker pull {{ docker_image }} | ||
| pull_docker_image_with_retries "{{ docker_image }}" | ||
|
|
||
| echo "[iris-init] Phase: config_setup" | ||
| sudo mkdir -p /etc/iris | ||
|
|
@@ -315,22 +358,65 @@ def build_worker_bootstrap_script( | |
| sudo sysctl -w net.ipv4.ip_local_port_range="1024 65535" | ||
| sudo sysctl -w net.ipv4.tcp_tw_reuse=1 | ||
|
|
||
| find_gcloud() { | ||
| if command -v gcloud > /dev/null 2>&1; then | ||
| command -v gcloud | ||
| return 0 | ||
| fi | ||
| if [ -x /snap/bin/gcloud ]; then | ||
| echo /snap/bin/gcloud | ||
| return 0 | ||
| fi | ||
| return 1 | ||
| } | ||
|
|
||
| configure_docker_auth_with_retries() { | ||
| local AR_HOST="$1" | ||
| local gcloud_bin="" | ||
| for attempt in $(seq 1 60); do | ||
| gcloud_bin=$(find_gcloud || true) | ||
| if [ -z "$gcloud_bin" ]; then | ||
| echo "[iris-controller] [3/5] Waiting for gcloud before configuring docker auth (attempt $attempt/60)" | ||
| sleep 2 | ||
| continue | ||
| fi | ||
| echo "[iris-controller] [3/5] Configuring docker auth for $AR_HOST (attempt $attempt/60)" | ||
| if sudo "$gcloud_bin" auth configure-docker "$AR_HOST" -q; then | ||
| return 0 | ||
| fi | ||
| echo "[iris-controller] [3/5] gcloud auth configure-docker failed for $AR_HOST (attempt $attempt/60)" | ||
| sleep 2 | ||
| done | ||
| echo "[iris-controller] [3/5] ERROR: Failed to configure docker auth for $AR_HOST after 60 attempts" | ||
| return 1 | ||
| } | ||
|
|
||
| pull_docker_image_with_retries() { | ||
| local image="$1" | ||
| for attempt in $(seq 1 5); do | ||
| if sudo docker pull "$image"; then | ||
| return 0 | ||
| fi | ||
| echo "[iris-controller] [4/5] Docker pull failed (attempt $attempt/5)" | ||
| if [ "$attempt" -lt 5 ]; then | ||
| sleep 5 | ||
| fi | ||
| done | ||
| echo "[iris-controller] [4/5] ERROR: Failed to pull image after 5 attempts: $image" | ||
| return 1 | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: The three shell functions ( One option to reduce drift: extract a shared Not blocking — the duplication is tolerable for a targeted production fix, but worth cleaning up as a follow-up. |
||
|
|
||
| echo "[iris-controller] [3/5] Pulling image: {{ docker_image }}" | ||
| echo "[iris-controller] This may take several minutes for large images..." | ||
|
|
||
| # Configure Artifact Registry auth on demand. | ||
| # Must run under sudo because `sudo docker pull` uses root's docker config. | ||
| if echo "{{ docker_image }}" | grep -q -- "-docker.pkg.dev/"; then | ||
| AR_HOST=$(echo "{{ docker_image }}" | cut -d/ -f1) | ||
| echo "[iris-controller] [3/5] Configuring docker auth for $AR_HOST" | ||
| if command -v gcloud &> /dev/null; then | ||
| sudo gcloud auth configure-docker "$AR_HOST" -q || true | ||
| else | ||
| echo "[iris-controller] [3/5] Warning: gcloud not found; AR pull may fail without prior auth" | ||
| fi | ||
| configure_docker_auth_with_retries "$AR_HOST" | ||
| fi | ||
|
|
||
| if sudo docker pull {{ docker_image }}; then | ||
| if pull_docker_image_with_retries "{{ docker_image }}"; then | ||
| echo "[iris-controller] [4/5] Image pull complete" | ||
| else | ||
| echo "[iris-controller] [4/5] ERROR: Image pull failed" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -44,7 +44,13 @@ def test_build_worker_bootstrap_script_configures_ar_auth() -> None: | |
| script = build_worker_bootstrap_script(cfg) | ||
|
|
||
| assert f'if echo "{ar_image}" | grep -q -- "-docker.pkg.dev/"' in script | ||
| assert 'sudo gcloud auth configure-docker "$AR_HOST" -q || true' in script | ||
| assert "if [ -x /snap/bin/gcloud ]; then" in script | ||
| assert 'echo "[iris-init] Waiting for gcloud before configuring docker auth (attempt $attempt/60)"' in script | ||
| assert 'echo "[iris-init] Configuring docker auth for $AR_HOST (attempt $attempt/60)"' in script | ||
| assert 'echo "[iris-init] Docker pull failed (attempt $attempt/5)"' in script | ||
| assert 'echo "[iris-init] ERROR: Failed to configure docker auth for $AR_HOST after 60 attempts"' in script | ||
| assert "sleep 2" in script | ||
| assert "sleep 5" in script | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor: these tests assert on exact log message strings embedded in the generated script, which means any log wording change requires updating both the template and the test. This is a common trade-off for template-based tests and is fine here — just flagging that these assertions will be brittle if the log format evolves. An alternative would be to assert on structural markers (e.g., |
||
|
|
||
|
|
||
| def test_build_worker_bootstrap_script_requires_controller_address() -> None: | ||
|
|
@@ -154,7 +160,16 @@ def resolve_image(image: str, zone: str | None = None) -> str: | |
| "Pulling image: europe-docker.pkg.dev/hai-gcp-models/ghcr-mirror/marin-community/iris-controller:latest" | ||
| in script | ||
| ) | ||
| assert 'sudo gcloud auth configure-docker "$AR_HOST" -q || true' in script | ||
| assert "if [ -x /snap/bin/gcloud ]; then" in script | ||
| assert ( | ||
| 'echo "[iris-controller] [3/5] Waiting for gcloud before configuring docker auth (attempt $attempt/60)"' | ||
| in script | ||
| ) | ||
| assert 'echo "[iris-controller] [3/5] Configuring docker auth for $AR_HOST (attempt $attempt/60)"' in script | ||
| assert 'echo "[iris-controller] [4/5] Docker pull failed (attempt $attempt/5)"' in script | ||
| assert ( | ||
| 'echo "[iris-controller] [3/5] ERROR: Failed to configure docker auth for $AR_HOST after 60 attempts"' in script | ||
| ) | ||
|
|
||
|
|
||
| # --- GcpWorkerProvider.resolve_image() tests --- | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.