Skip to content

Commit ddfe6fa

Browse files
committed
Increase reserved TPU cloud startup timeout
1 parent 81a33e2 commit ddfe6fa

File tree

2 files changed

+54
-4
lines changed

2 files changed

+54
-4
lines changed

lib/iris/src/iris/cluster/providers/gcp/workers.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ def _run():
7676
DEFAULT_BOOT_DISK_SIZE_GB = 50
7777
# pd-ssd provides ~6000 IOPS vs ~38 on pd-standard, critical for controller DB
7878
DEFAULT_BOOT_DISK_TYPE = "pd-ssd"
79+
RESERVED_TPU_CLOUD_READY_TIMEOUT = 7200.0
7980

8081

8182
def _gcp_instance_metadata(
@@ -692,8 +693,12 @@ def _run_tpu_bootstrap(
692693
Phase 2: Poll worker health endpoints until all respond healthy.
693694
On timeout: query Cloud Logging for [iris-init] entries for diagnostics.
694695
"""
696+
queued_resource_ready_timeout = (
697+
RESERVED_TPU_CLOUD_READY_TIMEOUT if handle.is_queued_resource else cloud_ready_timeout
698+
)
699+
695700
# Single deadline covers Phase 0 (queued resource wait) + Phase 1 (cloud READY); queued resources use RESERVED_TPU_CLOUD_READY_TIMEOUT, all other slices use cloud_ready_timeout.
696-
cloud_deadline = Deadline.from_now(Duration.from_seconds(cloud_ready_timeout))
701+
cloud_deadline = Deadline.from_now(Duration.from_seconds(queued_resource_ready_timeout))
697702

698703
# Phase 0: If this is a queued resource (reserved TPU), wait for ACTIVE
699704
# before polling the TPU VM state. The queued resource may sit in QUEUED
@@ -712,7 +717,7 @@ def _run_tpu_bootstrap(
712717
time.sleep(queued_resource_poll_interval)
713718
else:
714719
raise InfraError(
715-
f"Queued resource {handle.slice_id} did not become ACTIVE " f"within {cloud_ready_timeout}s"
720+
f"Queued resource {handle.slice_id} did not become ACTIVE " f"within {queued_resource_ready_timeout}s"
716721
)
717722

718723
while not cloud_deadline.expired():
@@ -731,7 +736,7 @@ def _run_tpu_bootstrap(
731736
)
732737
time.sleep(poll_interval)
733738
else:
734-
raise InfraError(f"Slice {handle.slice_id} did not reach cloud READY within {cloud_ready_timeout}s")
739+
raise InfraError(f"Slice {handle.slice_id} did not reach cloud READY within {queued_resource_ready_timeout}s")
735740

736741
workers = cloud_status.workers
737742
worker_addrs = [(w.worker_id, w.internal_address) for w in workers]

lib/iris/tests/cluster/providers/gcp/test_platform.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818

1919
from iris.cluster.providers.gcp.controller import GcpControllerProvider
2020
from iris.cluster.providers.gcp.fake import InMemoryGcpService
21-
from iris.cluster.providers.gcp.handles import GcpVmSliceHandle, _build_gce_resource_name
21+
from iris.cluster.providers.gcp.handles import GcpSliceHandle, GcpVmSliceHandle, _build_gce_resource_name
2222
from iris.cluster.providers.remote_exec import DirectSshRemoteExec, GceRemoteExec, GcloudRemoteExec
2323
from iris.cluster.providers.gcp.workers import (
2424
GcpWorkerProvider,
25+
RESERVED_TPU_CLOUD_READY_TIMEOUT,
26+
_run_tpu_bootstrap,
2527
_run_vm_slice_bootstrap,
2628
_validate_slice_config,
2729
)
@@ -929,6 +931,49 @@ def test_gcp_tpu_slice_os_login_prefers_external_ip_for_direct_ssh():
929931
assert status.workers[0]._remote_exec.host == "34.1.2.3"
930932

931933

934+
# =============================================================================
935+
# Section 6: TPU Slice Bootstrap Tests
936+
# =============================================================================
937+
938+
939+
class _ImmediateDeadline:
940+
def expired(self) -> bool:
941+
return True
942+
943+
944+
def test_reserved_tpu_bootstrap_uses_extended_cloud_timeout():
945+
"""Reserved TPU bootstrap uses the longer queued-resource timeout."""
946+
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
947+
handle = GcpSliceHandle(
948+
_slice_id="test-reserved-tpu",
949+
_zone="us-central2-b",
950+
_project_id="test-project",
951+
_labels={},
952+
_created_at=Timestamp.now(),
953+
_label_prefix="iris",
954+
_accelerator_variant="v4-32",
955+
_gcp_service=gcp_service,
956+
_ssh_config=config_pb2.SshConfig(),
957+
_bootstrapping=True,
958+
_is_queued_resource=True,
959+
)
960+
worker_config = config_pb2.WorkerConfig(port=10001)
961+
seen_deadlines = []
962+
963+
def _fake_deadline_from_now(duration):
964+
seen_deadlines.append(duration.to_seconds())
965+
return _ImmediateDeadline()
966+
967+
with unittest.mock.patch(
968+
"iris.cluster.providers.gcp.workers.Deadline.from_now",
969+
side_effect=_fake_deadline_from_now,
970+
):
971+
with pytest.raises(InfraError, match=rf"within {RESERVED_TPU_CLOUD_READY_TIMEOUT}s"):
972+
_run_tpu_bootstrap(gcp_service, "test-project", handle, worker_config)
973+
974+
assert seen_deadlines == [RESERVED_TPU_CLOUD_READY_TIMEOUT]
975+
976+
932977
# =============================================================================
933978
# Section 6: VM Slice Bootstrap Tests
934979
#

0 commit comments

Comments
 (0)