Skip to content

Commit eea25a4

Browse files
committed
[iris] Fix VM bootstrap monitor killing healthy workers
Split the shared 600s deadline into independent phase 1 (cloud READY, 600s) and phase 2 (bootstrap complete, 300s) timeouts so a slow VM startup cannot starve the bootstrap check. Add HTTP health probing as the primary bootstrap completion signal instead of relying solely on GCE serial port text matching, which is fragile due to buffering and byte/char offset drift. Serial port polling is retained for diagnostics and error detection. Fixes a bug where a worker registered and accepted tasks successfully, but the bootstrap monitor timed out 8 minutes later because "Bootstrap complete" never appeared in serial port output, killing the worker and its running task.
1 parent 180cb79 commit eea25a4

3 files changed

Lines changed: 242 additions & 23 deletions

File tree

lib/iris/src/iris/cluster/providers/gcp/fake.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ def __init__(
104104
self._label_prefix = label_prefix
105105
self._iris_labels = Labels(label_prefix) if mode == ServiceMode.LOCAL else None
106106

107+
# Serial port output injection for testing bootstrap monitoring
108+
self._serial_port_output: dict[tuple[str, str], str] = {}
109+
107110
# LOCAL mode: track spawned workers per slice for cleanup
108111
self._local_slices: dict[str, LocalSliceHandle] = {}
109112

@@ -321,10 +324,14 @@ def vm_set_metadata(self, name: str, zone: str, metadata: dict[str, str]) -> Non
321324
raise InfraError(f"VM {name!r} not found in zone {zone!r}")
322325
vm.metadata.update(metadata)
323326

327+
def set_serial_port_output(self, name: str, zone: str, output: str) -> None:
328+
"""Inject serial port output for a VM. Used by tests to simulate GCE serial console."""
329+
self._serial_port_output[(name, zone)] = output
330+
324331
def vm_get_serial_port_output(self, name: str, zone: str, start: int = 0) -> str:
325332
self._check_injected_failure("vm_get_serial_port_output")
326-
# DRY_RUN / LOCAL: return empty string (no serial console)
327-
return ""
333+
full_output = self._serial_port_output.get((name, zone), "")
334+
return full_output[start:]
328335

329336
# ========================================================================
330337
# LOCAL mode: worker spawning

lib/iris/src/iris/cluster/providers/gcp/workers.py

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -640,24 +640,37 @@ def _fetch_bootstrap_logs(project_id: str, handle: GcpSliceHandle) -> None:
640640
)
641641

642642

643+
def _probe_worker_health(address: str, port: int) -> bool:
644+
"""Probe the worker's HTTP health endpoint. Returns True if healthy."""
645+
try:
646+
resp = urllib.request.urlopen(f"http://{address}:{port}/health", timeout=5)
647+
return resp.status == 200
648+
except (urllib.error.URLError, urllib.error.HTTPError, OSError, TimeoutError):
649+
return False
650+
651+
643652
def _run_vm_slice_bootstrap(
644653
gcp_service: GcpService,
645654
handle: GcpVmSliceHandle,
646655
worker_config: config_pb2.WorkerConfig,
647656
poll_interval: float = 5.0,
648657
cloud_ready_timeout: float = 600.0,
658+
bootstrap_timeout: float = 300.0,
649659
) -> None:
650-
"""Monitor GCE startup-script bootstrap via serial port output.
660+
"""Monitor GCE startup-script bootstrap via health probe and serial port.
661+
662+
Phase 1: Wait for the VM to reach cloud READY with an internal IP.
663+
Phase 2: Poll the worker's /health endpoint (primary signal) and serial
664+
port output (secondary signal / diagnostics) until bootstrap completes.
651665
652-
The bootstrap script was baked into VM metadata at creation time, so the
653-
VM self-bootstraps on first boot. This method polls serial port output
654-
for [iris-init] log lines until the script emits ``Bootstrap complete``
655-
or the timeout expires.
666+
Each phase has its own independent timeout to prevent a slow phase 1
667+
from starving phase 2.
656668
"""
657-
deadline = Deadline.from_now(Duration.from_seconds(cloud_ready_timeout))
669+
cloud_deadline = Deadline.from_now(Duration.from_seconds(cloud_ready_timeout))
658670
poll_duration = Duration.from_seconds(poll_interval)
659671

660-
while not deadline.expired():
672+
# Phase 1: wait for cloud READY with an internal IP
673+
while not cloud_deadline.expired():
661674
cloud_status = handle._describe_cloud()
662675
if cloud_status.state in (CloudSliceState.FAILED, CloudSliceState.DELETING):
663676
raise InfraError(f"VM slice {handle.slice_id} entered {cloud_status.state} while waiting for cloud READY")
@@ -668,32 +681,45 @@ def _run_vm_slice_bootstrap(
668681
else:
669682
raise InfraError(f"VM slice {handle.slice_id} did not reach cloud READY within {cloud_ready_timeout}s")
670683

684+
worker_address = cloud_status.workers[0].internal_address
685+
worker_port = worker_config.port
686+
687+
# Phase 2: poll health endpoint + serial port with a fresh deadline
688+
bootstrap_deadline = Deadline.from_now(Duration.from_seconds(bootstrap_timeout))
671689
serial_offset = 0
672-
bootstrap_complete = False
673-
bootstrap_failed = False
674690

675-
while not deadline.expired():
691+
while not bootstrap_deadline.expired():
692+
# Primary signal: HTTP health probe
693+
if _probe_worker_health(worker_address, worker_port):
694+
logger.info("Worker health probe succeeded for VM slice %s", handle.slice_id)
695+
break
696+
697+
# Secondary signal: serial port for diagnostics and error detection
676698
output = gcp_service.vm_get_serial_port_output(handle._vm_name, handle._zone, start=serial_offset)
677699
if output:
700+
serial_offset += len(output)
701+
bootstrap_complete = False
702+
bootstrap_errored = False
678703
for line in output.splitlines():
679704
if "[iris-init]" in line:
680705
logger.info("[%s serial] %s", handle.slice_id, line.strip())
681706
if "Bootstrap complete" in line:
682707
bootstrap_complete = True
683708
if "[iris-init] ERROR" in line:
684-
bootstrap_failed = True
685-
686-
serial_offset += len(output)
709+
bootstrap_errored = True
687710

688-
if bootstrap_complete:
689-
break
690-
if bootstrap_failed:
691-
raise InfraError(f"Startup-script bootstrap failed for VM slice {handle.slice_id} (see serial output above)")
711+
if bootstrap_complete:
712+
logger.info("Serial port reports bootstrap complete for VM slice %s", handle.slice_id)
713+
break
714+
if bootstrap_errored:
715+
raise InfraError(
716+
f"Startup-script bootstrap failed for VM slice {handle.slice_id} (see serial output above)"
717+
)
692718

693719
time.sleep(poll_duration.to_seconds())
694720
else:
695-
raise InfraError(f"VM slice {handle.slice_id} startup-script did not complete within {cloud_ready_timeout}s")
721+
raise InfraError(f"VM slice {handle.slice_id} bootstrap did not complete within {bootstrap_timeout}s")
696722

697-
logger.info("Bootstrap completed for VM slice %s (via startup-script)", handle.slice_id)
723+
logger.info("Bootstrap completed for VM slice %s", handle.slice_id)
698724
with handle._bootstrap_lock:
699725
handle._bootstrap_state = CloudSliceState.READY

lib/iris/tests/cluster/providers/gcp/test_platform.py

Lines changed: 188 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,22 @@
1717
import pytest
1818

1919
from iris.cluster.providers.gcp.fake import InMemoryGcpService
20-
from iris.cluster.providers.gcp.handles import _build_gce_resource_name
21-
from iris.cluster.providers.gcp.workers import GcpWorkerProvider, _validate_slice_config
20+
from iris.cluster.providers.gcp.handles import GcpVmSliceHandle, _build_gce_resource_name
21+
from iris.cluster.providers.gcp.workers import (
22+
GcpWorkerProvider,
23+
_run_vm_slice_bootstrap,
24+
_validate_slice_config,
25+
)
2226
from iris.cluster.providers.manual.provider import ManualControllerProvider, ManualWorkerProvider
2327
from iris.cluster.providers.types import (
2428
CloudSliceState,
29+
InfraError,
2530
Labels,
2631
QuotaExhaustedError,
2732
)
2833
from iris.cluster.service_mode import ServiceMode
2934
from iris.rpc import config_pb2
35+
from iris.time_utils import Timestamp
3036

3137
# =============================================================================
3238
# Fixture infrastructure
@@ -633,3 +639,183 @@ def test_gcp_tpu_slice_passes_startup_script_metadata():
633639
assert "startup-script" in metadata
634640
assert "[iris-init]" in metadata["startup-script"]
635641
assert "test-image:latest" in metadata["startup-script"]
642+
643+
644+
# =============================================================================
645+
# Section 6: VM Slice Bootstrap Tests
646+
#
647+
# Tests for _run_vm_slice_bootstrap with split timeouts and health probing.
648+
# =============================================================================
649+
650+
651+
def _make_vm_slice_for_bootstrap(
652+
gcp_service: InMemoryGcpService,
653+
zone: str = "us-central2-b",
654+
) -> tuple[GcpVmSliceHandle, str]:
655+
"""Create a VM in InMemoryGcpService and return a handle + vm_name for bootstrap testing."""
656+
from iris.cluster.providers.gcp.service import VmCreateRequest
657+
658+
vm_name = "test-bootstrap-vm"
659+
gcp_service.vm_create(
660+
VmCreateRequest(
661+
name=vm_name,
662+
zone=zone,
663+
machine_type="n2-standard-4",
664+
labels={Labels("iris").iris_slice_id: vm_name},
665+
)
666+
)
667+
handle = GcpVmSliceHandle(
668+
_slice_id=vm_name,
669+
_vm_name=vm_name,
670+
_zone=zone,
671+
_project_id="test-project",
672+
_gcp_service=gcp_service,
673+
_labels={Labels("iris").iris_slice_id: vm_name},
674+
_created_at=Timestamp.now(),
675+
_label_prefix="iris",
676+
_bootstrapping=True,
677+
)
678+
return handle, vm_name
679+
680+
681+
def test_vm_bootstrap_health_probe_succeeds_without_serial_port():
682+
"""Bootstrap completes when health probe succeeds, even if serial port never shows 'Bootstrap complete'."""
683+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
684+
handle, _vm_name = _make_vm_slice_for_bootstrap(gcp_service)
685+
worker_config = config_pb2.WorkerConfig(port=10001)
686+
687+
with unittest.mock.patch(
688+
"iris.cluster.providers.gcp.workers._probe_worker_health",
689+
return_value=True,
690+
):
691+
_run_vm_slice_bootstrap(
692+
gcp_service,
693+
handle,
694+
worker_config,
695+
poll_interval=0.01,
696+
cloud_ready_timeout=5.0,
697+
bootstrap_timeout=5.0,
698+
)
699+
700+
assert handle._bootstrap_state == CloudSliceState.READY
701+
702+
703+
def test_vm_bootstrap_serial_port_succeeds_without_health_probe():
704+
"""Bootstrap completes via serial port 'Bootstrap complete' when health probe fails."""
705+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
706+
handle, vm_name = _make_vm_slice_for_bootstrap(gcp_service)
707+
worker_config = config_pb2.WorkerConfig(port=10001)
708+
709+
gcp_service.set_serial_port_output(
710+
vm_name,
711+
"us-central2-b",
712+
"[iris-init] Starting bootstrap\n[iris-init] Bootstrap complete\n",
713+
)
714+
715+
with unittest.mock.patch(
716+
"iris.cluster.providers.gcp.workers._probe_worker_health",
717+
return_value=False,
718+
):
719+
_run_vm_slice_bootstrap(
720+
gcp_service,
721+
handle,
722+
worker_config,
723+
poll_interval=0.01,
724+
cloud_ready_timeout=5.0,
725+
bootstrap_timeout=5.0,
726+
)
727+
728+
assert handle._bootstrap_state == CloudSliceState.READY
729+
730+
731+
def test_vm_bootstrap_serial_port_error_raises():
732+
"""Bootstrap fails immediately when serial port shows '[iris-init] ERROR'."""
733+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
734+
handle, vm_name = _make_vm_slice_for_bootstrap(gcp_service)
735+
worker_config = config_pb2.WorkerConfig(port=10001)
736+
737+
gcp_service.set_serial_port_output(
738+
vm_name,
739+
"us-central2-b",
740+
"[iris-init] ERROR: Docker pull failed\n",
741+
)
742+
743+
with unittest.mock.patch(
744+
"iris.cluster.providers.gcp.workers._probe_worker_health",
745+
return_value=False,
746+
):
747+
with pytest.raises(InfraError, match="bootstrap failed"):
748+
_run_vm_slice_bootstrap(
749+
gcp_service,
750+
handle,
751+
worker_config,
752+
poll_interval=0.01,
753+
cloud_ready_timeout=5.0,
754+
bootstrap_timeout=5.0,
755+
)
756+
757+
758+
def test_vm_bootstrap_phase2_has_independent_timeout():
759+
"""Phase 2 uses its own timeout, not the remainder from phase 1."""
760+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
761+
handle, _vm_name = _make_vm_slice_for_bootstrap(gcp_service)
762+
worker_config = config_pb2.WorkerConfig(port=10001)
763+
764+
# Health probe never succeeds, serial port never shows complete.
765+
# With a very short bootstrap_timeout, this should fail with phase 2 message.
766+
with unittest.mock.patch(
767+
"iris.cluster.providers.gcp.workers._probe_worker_health",
768+
return_value=False,
769+
):
770+
with pytest.raises(InfraError, match=r"bootstrap did not complete within 0\.05s"):
771+
_run_vm_slice_bootstrap(
772+
gcp_service,
773+
handle,
774+
worker_config,
775+
poll_interval=0.01,
776+
cloud_ready_timeout=600.0,
777+
bootstrap_timeout=0.05,
778+
)
779+
780+
781+
def test_vm_bootstrap_cloud_not_ready_raises_phase1_timeout():
782+
"""Phase 1 timeout triggers when VM never reaches READY."""
783+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
784+
785+
# Create a VM but set it to non-READY state
786+
from iris.cluster.providers.gcp.service import VmCreateRequest
787+
788+
vm_name = "test-stuck-vm"
789+
gcp_service.vm_create(
790+
VmCreateRequest(
791+
name=vm_name,
792+
zone="us-central2-b",
793+
machine_type="n2-standard-4",
794+
labels={Labels("iris").iris_slice_id: vm_name},
795+
)
796+
)
797+
# Set VM to STAGING so it never reaches READY
798+
gcp_service._vms[(vm_name, "us-central2-b")].status = "STAGING"
799+
800+
handle = GcpVmSliceHandle(
801+
_slice_id=vm_name,
802+
_vm_name=vm_name,
803+
_zone="us-central2-b",
804+
_project_id="test-project",
805+
_gcp_service=gcp_service,
806+
_labels={Labels("iris").iris_slice_id: vm_name},
807+
_created_at=Timestamp.now(),
808+
_label_prefix="iris",
809+
_bootstrapping=True,
810+
)
811+
worker_config = config_pb2.WorkerConfig(port=10001)
812+
813+
with pytest.raises(InfraError, match=r"did not reach cloud READY within 0\.05s"):
814+
_run_vm_slice_bootstrap(
815+
gcp_service,
816+
handle,
817+
worker_config,
818+
poll_interval=0.01,
819+
cloud_ready_timeout=0.05,
820+
bootstrap_timeout=300.0,
821+
)

0 commit comments

Comments
 (0)