Skip to content

Commit 079e8cd

Browse files
authored
[iris] Fix VM bootstrap monitor killing healthy workers (#4180)
- Split the shared 600s deadline in `_run_vm_slice_bootstrap` into independent phase 1 (cloud READY, 600s) and phase 2 (bootstrap complete, 300s) timeouts
- Add HTTP `/health` probing as the primary bootstrap completion signal instead of relying solely on GCE serial port text matching ("Bootstrap complete")
- Serial port polling retained for diagnostics and error detection

## Root cause

A worker VM registered at 17:39:12 and was serving tasks, but the bootstrap monitor's shared 600s deadline expired at 17:47:33 because the "Bootstrap complete" line never appeared in GCE serial port output (likely a flush/buffering issue). This killed a healthy worker and its running task.

The serial port approach is inherently fragile: the GCE serial port API uses byte offsets but the poller tracked Python character offsets (`len(output)`), and the output depends on the metadata_script_runner flushing before `exit 0`.
1 parent 1e301c8 commit 079e8cd

3 files changed

Lines changed: 242 additions & 23 deletions

File tree

lib/iris/src/iris/cluster/providers/gcp/fake.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ def __init__(
104104
self._label_prefix = label_prefix
105105
self._iris_labels = Labels(label_prefix) if mode == ServiceMode.LOCAL else None
106106

107+
# Serial port output injection for testing bootstrap monitoring
108+
self._serial_port_output: dict[tuple[str, str], str] = {}
109+
107110
# LOCAL mode: track spawned workers per slice for cleanup
108111
self._local_slices: dict[str, LocalSliceHandle] = {}
109112

@@ -321,10 +324,14 @@ def vm_set_metadata(self, name: str, zone: str, metadata: dict[str, str]) -> Non
321324
raise InfraError(f"VM {name!r} not found in zone {zone!r}")
322325
vm.metadata.update(metadata)
323326

327+
def set_serial_port_output(self, name: str, zone: str, output: str) -> None:
328+
"""Inject serial port output for a VM. Used by tests to simulate GCE serial console."""
329+
self._serial_port_output[(name, zone)] = output
330+
324331
def vm_get_serial_port_output(self, name: str, zone: str, start: int = 0) -> str:
325332
self._check_injected_failure("vm_get_serial_port_output")
326-
# DRY_RUN / LOCAL: return empty string (no serial console)
327-
return ""
333+
full_output = self._serial_port_output.get((name, zone), "")
334+
return full_output[start:]
328335

329336
# ========================================================================
330337
# LOCAL mode: worker spawning

lib/iris/src/iris/cluster/providers/gcp/workers.py

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -640,24 +640,37 @@ def _fetch_bootstrap_logs(project_id: str, handle: GcpSliceHandle) -> None:
640640
)
641641

642642

643+
def _probe_worker_health(address: str, port: int) -> bool:
644+
"""Probe the worker's HTTP health endpoint. Returns True if healthy."""
645+
try:
646+
resp = urllib.request.urlopen(f"http://{address}:{port}/health", timeout=5)
647+
return resp.status == 200
648+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError):
649+
return False
650+
651+
643652
def _run_vm_slice_bootstrap(
644653
gcp_service: GcpService,
645654
handle: GcpVmSliceHandle,
646655
worker_config: config_pb2.WorkerConfig,
647656
poll_interval: float = 5.0,
648657
cloud_ready_timeout: float = 600.0,
658+
bootstrap_timeout: float = 300.0,
649659
) -> None:
650-
"""Monitor GCE startup-script bootstrap via serial port output.
660+
"""Monitor GCE startup-script bootstrap via health probe and serial port.
661+
662+
Phase 1: Wait for the VM to reach cloud READY with an internal IP.
663+
Phase 2: Poll the worker's /health endpoint (primary signal) and serial
664+
port output (secondary signal / diagnostics) until bootstrap completes.
651665
652-
The bootstrap script was baked into VM metadata at creation time, so the
653-
VM self-bootstraps on first boot. This method polls serial port output
654-
for [iris-init] log lines until the script emits ``Bootstrap complete``
655-
or the timeout expires.
666+
Each phase has its own independent timeout to prevent a slow phase 1
667+
from starving phase 2.
656668
"""
657-
deadline = Deadline.from_now(Duration.from_seconds(cloud_ready_timeout))
669+
cloud_deadline = Deadline.from_now(Duration.from_seconds(cloud_ready_timeout))
658670
poll_duration = Duration.from_seconds(poll_interval)
659671

660-
while not deadline.expired():
672+
# Phase 1: wait for cloud READY with an internal IP
673+
while not cloud_deadline.expired():
661674
cloud_status = handle._describe_cloud()
662675
if cloud_status.state in (CloudSliceState.FAILED, CloudSliceState.DELETING):
663676
raise InfraError(f"VM slice {handle.slice_id} entered {cloud_status.state} while waiting for cloud READY")
@@ -668,32 +681,45 @@ def _run_vm_slice_bootstrap(
668681
else:
669682
raise InfraError(f"VM slice {handle.slice_id} did not reach cloud READY within {cloud_ready_timeout}s")
670683

684+
worker_address = cloud_status.workers[0].internal_address
685+
worker_port = worker_config.port
686+
687+
# Phase 2: poll health endpoint + serial port with a fresh deadline
688+
bootstrap_deadline = Deadline.from_now(Duration.from_seconds(bootstrap_timeout))
671689
serial_offset = 0
672-
bootstrap_complete = False
673-
bootstrap_failed = False
674690

675-
while not deadline.expired():
691+
while not bootstrap_deadline.expired():
692+
# Primary signal: HTTP health probe
693+
if _probe_worker_health(worker_address, worker_port):
694+
logger.info("Worker health probe succeeded for VM slice %s", handle.slice_id)
695+
break
696+
697+
# Secondary signal: serial port for diagnostics and error detection
676698
output = gcp_service.vm_get_serial_port_output(handle._vm_name, handle._zone, start=serial_offset)
677699
if output:
700+
serial_offset += len(output)
701+
bootstrap_complete = False
702+
bootstrap_errored = False
678703
for line in output.splitlines():
679704
if "[iris-init]" in line:
680705
logger.info("[%s serial] %s", handle.slice_id, line.strip())
681706
if "Bootstrap complete" in line:
682707
bootstrap_complete = True
683708
if "[iris-init] ERROR" in line:
684-
bootstrap_failed = True
685-
686-
serial_offset += len(output)
709+
bootstrap_errored = True
687710

688-
if bootstrap_complete:
689-
break
690-
if bootstrap_failed:
691-
raise InfraError(f"Startup-script bootstrap failed for VM slice {handle.slice_id} (see serial output above)")
711+
if bootstrap_complete:
712+
logger.info("Serial port reports bootstrap complete for VM slice %s", handle.slice_id)
713+
break
714+
if bootstrap_errored:
715+
raise InfraError(
716+
f"Startup-script bootstrap failed for VM slice {handle.slice_id} (see serial output above)"
717+
)
692718

693719
time.sleep(poll_duration.to_seconds())
694720
else:
695-
raise InfraError(f"VM slice {handle.slice_id} startup-script did not complete within {cloud_ready_timeout}s")
721+
raise InfraError(f"VM slice {handle.slice_id} bootstrap did not complete within {bootstrap_timeout}s")
696722

697-
logger.info("Bootstrap completed for VM slice %s (via startup-script)", handle.slice_id)
723+
logger.info("Bootstrap completed for VM slice %s", handle.slice_id)
698724
with handle._bootstrap_lock:
699725
handle._bootstrap_state = CloudSliceState.READY

lib/iris/tests/cluster/providers/gcp/test_platform.py

Lines changed: 188 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,22 @@
1717
import pytest
1818

1919
from iris.cluster.providers.gcp.fake import InMemoryGcpService
20-
from iris.cluster.providers.gcp.handles import _build_gce_resource_name
21-
from iris.cluster.providers.gcp.workers import GcpWorkerProvider, _validate_slice_config
20+
from iris.cluster.providers.gcp.handles import GcpVmSliceHandle, _build_gce_resource_name
21+
from iris.cluster.providers.gcp.workers import (
22+
GcpWorkerProvider,
23+
_run_vm_slice_bootstrap,
24+
_validate_slice_config,
25+
)
2226
from iris.cluster.providers.manual.provider import ManualControllerProvider, ManualWorkerProvider
2327
from iris.cluster.providers.types import (
2428
CloudSliceState,
29+
InfraError,
2530
Labels,
2631
QuotaExhaustedError,
2732
)
2833
from iris.cluster.service_mode import ServiceMode
2934
from iris.rpc import config_pb2
35+
from iris.time_utils import Timestamp
3036

3137
# =============================================================================
3238
# Fixture infrastructure
@@ -633,3 +639,183 @@ def test_gcp_tpu_slice_passes_startup_script_metadata():
633639
assert "startup-script" in metadata
634640
assert "[iris-init]" in metadata["startup-script"]
635641
assert "test-image:latest" in metadata["startup-script"]
642+
643+
644+
# =============================================================================
645+
# Section 6: VM Slice Bootstrap Tests
646+
#
647+
# Tests for _run_vm_slice_bootstrap with split timeouts and health probing.
648+
# =============================================================================
649+
650+
651+
def _make_vm_slice_for_bootstrap(
652+
gcp_service: InMemoryGcpService,
653+
zone: str = "us-central2-b",
654+
) -> tuple[GcpVmSliceHandle, str]:
655+
"""Create a VM in InMemoryGcpService and return a handle + vm_name for bootstrap testing."""
656+
from iris.cluster.providers.gcp.service import VmCreateRequest
657+
658+
vm_name = "test-bootstrap-vm"
659+
gcp_service.vm_create(
660+
VmCreateRequest(
661+
name=vm_name,
662+
zone=zone,
663+
machine_type="n2-standard-4",
664+
labels={Labels("iris").iris_slice_id: vm_name},
665+
)
666+
)
667+
handle = GcpVmSliceHandle(
668+
_slice_id=vm_name,
669+
_vm_name=vm_name,
670+
_zone=zone,
671+
_project_id="test-project",
672+
_gcp_service=gcp_service,
673+
_labels={Labels("iris").iris_slice_id: vm_name},
674+
_created_at=Timestamp.now(),
675+
_label_prefix="iris",
676+
_bootstrapping=True,
677+
)
678+
return handle, vm_name
679+
680+
681+
def test_vm_bootstrap_health_probe_succeeds_without_serial_port():
682+
"""Bootstrap completes when health probe succeeds, even if serial port never shows 'Bootstrap complete'."""
683+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
684+
handle, _vm_name = _make_vm_slice_for_bootstrap(gcp_service)
685+
worker_config = config_pb2.WorkerConfig(port=10001)
686+
687+
with unittest.mock.patch(
688+
"iris.cluster.providers.gcp.workers._probe_worker_health",
689+
return_value=True,
690+
):
691+
_run_vm_slice_bootstrap(
692+
gcp_service,
693+
handle,
694+
worker_config,
695+
poll_interval=0.01,
696+
cloud_ready_timeout=5.0,
697+
bootstrap_timeout=5.0,
698+
)
699+
700+
assert handle._bootstrap_state == CloudSliceState.READY
701+
702+
703+
def test_vm_bootstrap_serial_port_succeeds_without_health_probe():
704+
"""Bootstrap completes via serial port 'Bootstrap complete' when health probe fails."""
705+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
706+
handle, vm_name = _make_vm_slice_for_bootstrap(gcp_service)
707+
worker_config = config_pb2.WorkerConfig(port=10001)
708+
709+
gcp_service.set_serial_port_output(
710+
vm_name,
711+
"us-central2-b",
712+
"[iris-init] Starting bootstrap\n[iris-init] Bootstrap complete\n",
713+
)
714+
715+
with unittest.mock.patch(
716+
"iris.cluster.providers.gcp.workers._probe_worker_health",
717+
return_value=False,
718+
):
719+
_run_vm_slice_bootstrap(
720+
gcp_service,
721+
handle,
722+
worker_config,
723+
poll_interval=0.01,
724+
cloud_ready_timeout=5.0,
725+
bootstrap_timeout=5.0,
726+
)
727+
728+
assert handle._bootstrap_state == CloudSliceState.READY
729+
730+
731+
def test_vm_bootstrap_serial_port_error_raises():
732+
"""Bootstrap fails immediately when serial port shows '[iris-init] ERROR'."""
733+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
734+
handle, vm_name = _make_vm_slice_for_bootstrap(gcp_service)
735+
worker_config = config_pb2.WorkerConfig(port=10001)
736+
737+
gcp_service.set_serial_port_output(
738+
vm_name,
739+
"us-central2-b",
740+
"[iris-init] ERROR: Docker pull failed\n",
741+
)
742+
743+
with unittest.mock.patch(
744+
"iris.cluster.providers.gcp.workers._probe_worker_health",
745+
return_value=False,
746+
):
747+
with pytest.raises(InfraError, match="bootstrap failed"):
748+
_run_vm_slice_bootstrap(
749+
gcp_service,
750+
handle,
751+
worker_config,
752+
poll_interval=0.01,
753+
cloud_ready_timeout=5.0,
754+
bootstrap_timeout=5.0,
755+
)
756+
757+
758+
def test_vm_bootstrap_phase2_has_independent_timeout():
759+
"""Phase 2 uses its own timeout, not the remainder from phase 1."""
760+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
761+
handle, _vm_name = _make_vm_slice_for_bootstrap(gcp_service)
762+
worker_config = config_pb2.WorkerConfig(port=10001)
763+
764+
# Health probe never succeeds, serial port never shows complete.
765+
# With a very short bootstrap_timeout, this should fail with phase 2 message.
766+
with unittest.mock.patch(
767+
"iris.cluster.providers.gcp.workers._probe_worker_health",
768+
return_value=False,
769+
):
770+
with pytest.raises(InfraError, match=r"bootstrap did not complete within 0\.05s"):
771+
_run_vm_slice_bootstrap(
772+
gcp_service,
773+
handle,
774+
worker_config,
775+
poll_interval=0.01,
776+
cloud_ready_timeout=600.0,
777+
bootstrap_timeout=0.05,
778+
)
779+
780+
781+
def test_vm_bootstrap_cloud_not_ready_raises_phase1_timeout():
782+
"""Phase 1 timeout triggers when VM never reaches READY."""
783+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
784+
785+
# Create a VM but set it to non-READY state
786+
from iris.cluster.providers.gcp.service import VmCreateRequest
787+
788+
vm_name = "test-stuck-vm"
789+
gcp_service.vm_create(
790+
VmCreateRequest(
791+
name=vm_name,
792+
zone="us-central2-b",
793+
machine_type="n2-standard-4",
794+
labels={Labels("iris").iris_slice_id: vm_name},
795+
)
796+
)
797+
# Set VM to STAGING so it never reaches READY
798+
gcp_service._vms[(vm_name, "us-central2-b")].status = "STAGING"
799+
800+
handle = GcpVmSliceHandle(
801+
_slice_id=vm_name,
802+
_vm_name=vm_name,
803+
_zone="us-central2-b",
804+
_project_id="test-project",
805+
_gcp_service=gcp_service,
806+
_labels={Labels("iris").iris_slice_id: vm_name},
807+
_created_at=Timestamp.now(),
808+
_label_prefix="iris",
809+
_bootstrapping=True,
810+
)
811+
worker_config = config_pb2.WorkerConfig(port=10001)
812+
813+
with pytest.raises(InfraError, match=r"did not reach cloud READY within 0\.05s"):
814+
_run_vm_slice_bootstrap(
815+
gcp_service,
816+
handle,
817+
worker_config,
818+
poll_interval=0.01,
819+
cloud_ready_timeout=0.05,
820+
bootstrap_timeout=300.0,
821+
)

0 commit comments

Comments (0)