Commit 5f44126

Committed by github-actions[bot], rjpower, and claude
Deduplicate unavailability handling into poll_with_retries in errors.py
Move the retry-with-backoff-on-unavailable logic from wait_for_job and wait_for_job_with_streaming into a shared poll_with_retries() function in the errors.py retry library. The new function respects the caller's deadline: if the timeout expires during controller unavailability, it raises TimeoutError instead of continuing to retry for the full tolerance window.

Co-authored-by: Russell Power <rjpower@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent: 260706c

3 files changed: 197 additions & 84 deletions
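
For orientation, here is a minimal sketch (not part of the commit) of how a monitoring loop drives the new helper. The get_job_status and is_job_finished names stand in for the RemoteClient pieces shown in the diff below, and the job name and durations are illustrative.

from iris.rpc.errors import poll_with_retries
from iris.time_utils import Deadline

deadline = Deadline.from_seconds(600.0)  # overall budget for the wait
while True:
    # Each iteration issues one status RPC; transient controller outages are
    # retried inside poll_with_retries until the tolerance or deadline is hit.
    job_info = poll_with_retries(
        "demo-job",                          # operation name used in log messages
        lambda: get_job_status("demo-job"),  # the RPC to poll
        deadline=deadline,
        unavailable_tolerance=600.0,         # seconds of continuous outage to tolerate
    )
    if is_job_finished(job_info.state):
        break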

lib/iris/src/iris/cluster/client/remote_client.py

Lines changed: 19 additions & 82 deletions
@@ -17,7 +17,7 @@
 from iris.cluster.types import Entrypoint, EnvironmentSpec, JobName, TaskAttempt, adjust_tpu_replicas, is_job_finished
 from iris.rpc import cluster_pb2
 from iris.rpc.cluster_connect import ControllerServiceClientSync
-from iris.rpc.errors import call_with_retry, format_connect_error, is_retryable_error
+from iris.rpc.errors import call_with_retry, format_connect_error, poll_with_retries
 from iris.time_utils import Deadline, Duration, ExponentialBackoff

 logger = logging.getLogger(__name__)
@@ -139,8 +139,9 @@ def wait_for_job(
         """Wait for job to complete with exponential backoff polling.

         If the controller becomes unavailable, retries with backoff for up to
-        ``CONTROLLER_UNAVAILABLE_TOLERANCE`` seconds before giving up. The
-        unavailable timer resets each time a status check succeeds.
+        ``CONTROLLER_UNAVAILABLE_TOLERANCE`` seconds or until the caller's
+        *timeout* expires — whichever comes first. The unavailability timer
+        resets each time a status check succeeds.

         Args:
             job_id: Full job ID
@@ -155,46 +156,14 @@ def wait_for_job(
         """
         deadline = Deadline.from_seconds(timeout)
         backoff = ExponentialBackoff(initial=0.1, maximum=poll_interval)
-        unavailable_backoff = ExponentialBackoff(initial=1.0, maximum=60.0, factor=2.0)
-        unavailable_since: float | None = None

         while True:
-            try:
-                job_info = self.get_job_status(job_id)
-            except Exception as e:
-                if not is_retryable_error(e):
-                    raise
-                now = time.monotonic()
-                if unavailable_since is None:
-                    unavailable_since = now
-                elapsed_unavailable = now - unavailable_since
-                if elapsed_unavailable >= CONTROLLER_UNAVAILABLE_TOLERANCE:
-                    logger.error(
-                        "Controller unavailable for %.0fs, giving up on %s",
-                        elapsed_unavailable,
-                        job_id,
-                    )
-                    raise
-                logger.warning(
-                    "Controller unavailable for %s (%.0fs), job is still running server-side: %s",
-                    job_id,
-                    elapsed_unavailable,
-                    e,
-                )
-                interval = unavailable_backoff.next_interval()
-                time.sleep(min(interval, deadline.remaining_seconds()))
-                continue
-
-            # Controller responded — reset unavailability tracking.
-            if unavailable_since is not None:
-                elapsed_unavailable = time.monotonic() - unavailable_since
-                logger.info(
-                    "Controller back online for %s after %.0fs of unavailability",
-                    job_id,
-                    elapsed_unavailable,
-                )
-                unavailable_since = None
-                unavailable_backoff.reset()
+            job_info = poll_with_retries(
+                str(job_id),
+                lambda: self.get_job_status(job_id),
+                deadline=deadline,
+                unavailable_tolerance=CONTROLLER_UNAVAILABLE_TOLERANCE,
+            )

             if is_job_finished(job_info.state):
                 return job_info
@@ -222,9 +191,9 @@ def wait_for_job_with_streaming(
         credentials and endpoint configuration), avoiding client-side S3 access.

         If the controller becomes unavailable, retries with backoff for up to
-        ``CONTROLLER_UNAVAILABLE_TOLERANCE`` seconds before giving up. Log fetch
-        failures are treated the same way — they do not count toward a hard
-        failure limit while the controller is unreachable.
+        ``CONTROLLER_UNAVAILABLE_TOLERANCE`` seconds or until the caller's
+        *timeout* expires — whichever comes first. Log fetch failures are
+        non-fatal — they log a warning but never abort monitoring.

         Child job statuses are delivered inline in ``GetTaskLogsResponse`` (when
         *include_children* is True), so detecting state transitions requires no
@@ -236,52 +205,20 @@ def wait_for_job_with_streaming(
         """
         deadline = Deadline.from_seconds(timeout)
         terminal_status: cluster_pb2.JobStatus | None = None
-        unavailable_backoff = ExponentialBackoff(initial=1.0, maximum=60.0, factor=2.0)
-        unavailable_since: float | None = None
         # Track child job states so we fire callbacks once per transition.
         child_job_states: dict[str, int] = {}
         cursor: int = 0

         while True:
-            try:
-                status = self.get_job_status(job_id)
-            except Exception as e:
-                if not is_retryable_error(e):
-                    raise
-                now = time.monotonic()
-                if unavailable_since is None:
-                    unavailable_since = now
-                elapsed_unavailable = now - unavailable_since
-                if elapsed_unavailable >= CONTROLLER_UNAVAILABLE_TOLERANCE:
-                    logger.error(
-                        "Controller unavailable for %.0fs, giving up on %s",
-                        elapsed_unavailable,
-                        job_id,
-                    )
-                    raise
-                logger.warning(
-                    "Controller unavailable for %s (%.0fs), job is still running server-side: %s",
-                    job_id,
-                    elapsed_unavailable,
-                    e,
-                )
-                interval = unavailable_backoff.next_interval()
-                time.sleep(min(interval, deadline.remaining_seconds()))
-                continue
+            status = poll_with_retries(
+                str(job_id),
+                lambda: self.get_job_status(job_id),
+                deadline=deadline,
+                unavailable_tolerance=CONTROLLER_UNAVAILABLE_TOLERANCE,
+            )

             state_name = cluster_pb2.JobState.Name(status.state)

-            # Controller responded — reset unavailability tracking.
-            if unavailable_since is not None:
-                elapsed_unavailable = time.monotonic() - unavailable_since
-                logger.info(
-                    "Controller back online for %s after %.0fs of unavailability",
-                    job_id,
-                    elapsed_unavailable,
-                )
-                unavailable_since = None
-                unavailable_backoff.reset()
-
             try:
                 log_response = self.fetch_task_logs(
                     job_id,

lib/iris/src/iris/rpc/errors.py

Lines changed: 93 additions & 1 deletion
@@ -15,7 +15,7 @@
 from google.protobuf.any_pb2 import Any as AnyProto

 from iris.rpc import errors_pb2
-from iris.time_utils import ExponentialBackoff, Timestamp
+from iris.time_utils import Deadline, ExponentialBackoff, Timestamp

 logger = logging.getLogger(__name__)

@@ -220,3 +220,95 @@ def call_with_retry(

     assert last_exception is not None
     raise last_exception
+
+
+def poll_with_retries(
+    operation: str,
+    poll_fn: Callable[[], T],
+    *,
+    deadline: Deadline,
+    unavailable_tolerance: float = 3600.0,
+    backoff: ExponentialBackoff | None = None,
+) -> T:
+    """Poll an RPC endpoint, tolerating transient unavailability.
+
+    Calls ``poll_fn`` in a loop. On retryable errors the function backs off
+    and keeps trying for up to ``unavailable_tolerance`` seconds **or** until
+    ``deadline`` expires — whichever comes first. When the call succeeds the
+    unavailability timer resets.
+
+    This is designed for monitoring loops (e.g. ``wait_for_job``) where the
+    server-side work continues regardless of client polling failures.
+
+    Args:
+        operation: Human-readable description for log messages.
+        poll_fn: Callable that performs the RPC. Should raise on failure.
+        deadline: Caller-supplied deadline — polling stops with ``TimeoutError``
+            if the deadline expires, even during unavailability.
+        unavailable_tolerance: Maximum seconds to tolerate continuous
+            controller unavailability before re-raising the RPC error.
+        backoff: Backoff for unavailability retries. Defaults to 1 s → 60 s.
+
+    Returns:
+        The successful result of ``poll_fn``.
+
+    Raises:
+        TimeoutError: If *deadline* expires while the controller is unavailable.
+        Exception: The last RPC error if unavailability exceeds the tolerance,
+            or any non-retryable error from ``poll_fn``.
+    """
+
+    if backoff is None:
+        backoff = ExponentialBackoff(initial=1.0, maximum=60.0, factor=2.0)
+    else:
+        backoff = backoff.copy()
+
+    unavailable_since: float | None = None
+
+    while True:
+        try:
+            result = poll_fn()
+        except Exception as e:
+            if not is_retryable_error(e):
+                raise
+
+            now = time.monotonic()
+            if unavailable_since is None:
+                unavailable_since = now
+            elapsed_unavailable = now - unavailable_since
+
+            if elapsed_unavailable >= unavailable_tolerance:
+                logger.error(
+                    "Controller unavailable for %.0fs, giving up on %s",
+                    elapsed_unavailable,
+                    operation,
+                )
+                raise
+
+            if deadline.expired():
+                raise TimeoutError(
+                    f"{operation}: deadline expired after {elapsed_unavailable:.0f}s of controller unavailability"
+                ) from e
+
+            logger.warning(
+                "Controller unavailable for %s (%.0fs), job is still running server-side: %s",
+                operation,
+                elapsed_unavailable,
+                e,
+            )
+            interval = backoff.next_interval()
+            time.sleep(min(interval, deadline.remaining_seconds()))
+            continue
+
+        # Success — reset unavailability tracking.
+        if unavailable_since is not None:
+            elapsed_unavailable = time.monotonic() - unavailable_since
+            logger.info(
+                "Controller back online for %s after %.0fs of unavailability",
+                operation,
+                elapsed_unavailable,
+            )
+            unavailable_since = None
+            backoff.reset()
+
+        return result
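
The two failure paths differ for callers: deadline expiry surfaces as TimeoutError, while exhausting unavailable_tolerance re-raises the last RPC error. A minimal sketch of telling them apart, assuming a hypothetical zero-argument fetch_status callable and illustrative durations:

from iris.rpc.errors import poll_with_retries
from iris.time_utils import Deadline

try:
    status = poll_with_retries(
        "status-check",
        fetch_status,  # hypothetical RPC callable; raises on failure
        deadline=Deadline.from_seconds(30.0),
        unavailable_tolerance=300.0,
    )
except TimeoutError:
    # The caller's 30 s deadline ran out while the controller was still unreachable.
    status = None
except Exception:
    # A non-retryable RPC error, or the controller stayed down past 300 s.
    raise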

lib/iris/tests/rpc/test_errors.py

Lines changed: 85 additions & 1 deletion
@@ -12,8 +12,9 @@
     connect_error_sanitized,
     connect_error_with_traceback,
     extract_error_details,
+    poll_with_retries,
 )
-from iris.time_utils import ExponentialBackoff
+from iris.time_utils import Deadline, ExponentialBackoff


 def test_connect_error_with_traceback_populates_timestamp() -> None:
@@ -172,3 +173,86 @@ def fail_then_succeed():
     )
     assert result == "recovered"
     assert call_count == 4
+
+
+# -- poll_with_retries tests --
+
+
+def test_poll_with_retries_succeeds_immediately() -> None:
+    result = poll_with_retries(
+        "test",
+        lambda: "ok",
+        deadline=Deadline.from_seconds(5.0),
+    )
+    assert result == "ok"
+
+
+def test_poll_with_retries_retries_then_succeeds() -> None:
+    call_count = 0
+
+    def flaky():
+        nonlocal call_count
+        call_count += 1
+        if call_count <= 2:
+            raise ConnectError(Code.UNAVAILABLE, "down")
+        return "recovered"
+
+    result = poll_with_retries(
+        "test",
+        flaky,
+        deadline=Deadline.from_seconds(5.0),
+        backoff=ExponentialBackoff(initial=0.01, maximum=0.05),
+    )
+    assert result == "recovered"
+    assert call_count == 3
+
+
+def test_poll_with_retries_respects_deadline() -> None:
+    """Deadline expiry during unavailability raises TimeoutError, not the RPC error."""
+
+    def always_fail():
+        raise ConnectError(Code.UNAVAILABLE, "down")
+
+    with pytest.raises(TimeoutError, match="deadline expired"):
+        poll_with_retries(
+            "test",
+            always_fail,
+            deadline=Deadline.from_seconds(0.3),
+            unavailable_tolerance=3600.0,
+            backoff=ExponentialBackoff(initial=0.01, maximum=0.05),
+        )
+
+
+def test_poll_with_retries_respects_unavailable_tolerance() -> None:
+    """Unavailability tolerance expiry re-raises the RPC error."""
+
+    def always_fail():
+        raise ConnectError(Code.UNAVAILABLE, "down")
+
+    with pytest.raises(ConnectError) as exc_info:
+        poll_with_retries(
+            "test",
+            always_fail,
+            deadline=Deadline.from_seconds(10.0),
+            unavailable_tolerance=0.3,
+            backoff=ExponentialBackoff(initial=0.01, maximum=0.05),
+        )
+    assert exc_info.value.code == Code.UNAVAILABLE
+
+
+def test_poll_with_retries_raises_non_retryable_immediately() -> None:
+    call_count = 0
+
+    def not_found():
+        nonlocal call_count
+        call_count += 1
+        raise ConnectError(Code.NOT_FOUND, "gone")
+
+    with pytest.raises(ConnectError) as exc_info:
+        poll_with_retries(
+            "test",
+            not_found,
+            deadline=Deadline.from_seconds(5.0),
+        )
+    assert exc_info.value.code == Code.NOT_FOUND
+    assert call_count == 1