Skip to content

Commit 2b25aed

Browse files
fix(iris): add --async to gcloud delete/reset to unblock heartbeat thread
Synchronous gcloud subprocess calls (especially `tpu-vm delete`, which takes 10-18s) were blocking the heartbeat thread, stalling all task dispatch cluster-wide for minutes at a time.

Add the `--async` flag to all gcloud delete and reset commands so they return immediately after submitting the operation to GCP. All callers already handle terminate() failures gracefully (catch + log), so async deletion is safe. Create and describe commands remain synchronous since their results are needed immediately.

Fixes #3678

Co-authored-by: Russell Power <rjpower@users.noreply.github.com>
1 parent a2d6a05 commit 2b25aed

File tree

1 file changed

+13
-5
lines changed
  • lib/iris/src/iris/cluster/platform

1 file changed

+13
-5
lines changed

lib/iris/src/iris/cluster/platform/gcp.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,7 @@ def reboot(self) -> None:
373373
f"--project={self._project_id}",
374374
f"--zone={self._zone}",
375375
"--quiet",
376+
"--async",
376377
]
377378
logger.info("Rebooting GCE instance: %s", self._gce_vm_name)
378379
logger.info("gcloud command: %s", cmd)
@@ -388,8 +389,9 @@ def terminate(self) -> None:
388389
f"--project={self._project_id}",
389390
f"--zone={self._zone}",
390391
"--quiet",
392+
"--async",
391393
]
392-
logger.info("Deleting GCE instance: %s", self._gce_vm_name)
394+
logger.info("Deleting GCE instance (async): %s", self._gce_vm_name)
393395
logger.info("gcloud command: %s", cmd)
394396
result = subprocess.run(cmd, capture_output=True, text=True)
395397
if result.returncode != 0:
@@ -585,8 +587,9 @@ def terminate(self) -> None:
585587
f"--zone={self._zone}",
586588
f"--project={self._project_id}",
587589
"--quiet",
590+
"--async",
588591
]
589-
logger.info("Terminating TPU: %s", self._slice_id)
592+
logger.info("Terminating TPU (async): %s", self._slice_id)
590593
logger.info("gcloud command: %s", cmd)
591594
result = subprocess.run(cmd, capture_output=True, text=True)
592595
if result.returncode != 0:
@@ -722,8 +725,9 @@ def terminate(self) -> None:
722725
f"--project={self._project_id}",
723726
f"--zone={self._zone}",
724727
"--quiet",
728+
"--async",
725729
]
726-
logger.info("Terminating VM slice: %s (vm=%s)", self._slice_id, self._vm_name)
730+
logger.info("Terminating VM slice (async): %s (vm=%s)", self._slice_id, self._vm_name)
727731
logger.info("gcloud command: %s", cmd)
728732
result = subprocess.run(cmd, capture_output=True, text=True)
729733
if result.returncode != 0:
@@ -785,6 +789,7 @@ def _best_effort_delete_tpu(self, slice_id: str, zone: str) -> None:
785789
"""Try to delete a TPU VM that may have been partially created.
786790
787791
Silently ignores "not found" errors (resource was never created).
792+
Uses --async so the caller is not blocked waiting for deletion.
788793
"""
789794
cmd = [
790795
"gcloud",
@@ -796,8 +801,9 @@ def _best_effort_delete_tpu(self, slice_id: str, zone: str) -> None:
796801
f"--zone={zone}",
797802
f"--project={self._project_id}",
798803
"--quiet",
804+
"--async",
799805
]
800-
logger.info("Best-effort cleanup of TPU %s in %s", slice_id, zone)
806+
logger.info("Best-effort async cleanup of TPU %s in %s", slice_id, zone)
801807
result = subprocess.run(cmd, capture_output=True, text=True)
802808
if result.returncode != 0:
803809
error = result.stderr.strip()
@@ -808,6 +814,7 @@ def _best_effort_delete_vm(self, vm_name: str, zone: str) -> None:
808814
"""Try to delete a GCE VM that may have been partially created.
809815
810816
Silently ignores "not found" errors (resource was never created).
817+
Uses --async so the caller is not blocked waiting for deletion.
811818
"""
812819
cmd = [
813820
"gcloud",
@@ -818,8 +825,9 @@ def _best_effort_delete_vm(self, vm_name: str, zone: str) -> None:
818825
f"--zone={zone}",
819826
f"--project={self._project_id}",
820827
"--quiet",
828+
"--async",
821829
]
822-
logger.info("Best-effort cleanup of VM %s in %s", vm_name, zone)
830+
logger.info("Best-effort async cleanup of VM %s in %s", vm_name, zone)
823831
result = subprocess.run(cmd, capture_output=True, text=True)
824832
if result.returncode != 0:
825833
error = result.stderr.strip()

0 commit comments

Comments
 (0)