Skip to content

Commit dfdc21f

Browse files
yoblinclaude
and committed
[iris/zephyr] Propagate KubectlError; preserve LocalClient closure semantics
- Provider catches KubectlError during pod apply and returns it as a TASK_STATE_FAILED update with the real error, instead of masking it as "Pod not found". Includes transition test for ASSIGNED->FAILED. - Config-to-disk only on distributed backends. LocalClient passes the config object inline to preserve closure semantics for callers that mutate enclosing-scope state (e.g. _load_fuzzy_dupe_map_shard). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 55c83da commit dfdc21f

4 files changed

Lines changed: 86 additions & 12 deletions

File tree

lib/iris/src/iris/cluster/k8s/provider.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from iris.cluster.controller.transitions import ClusterCapacity, DirectProviderSyncResult, SchedulingEvent
2323
from iris.cluster.controller.transitions import DirectProviderBatch, RunningTaskEntry, TaskUpdate
2424
from iris.cluster.k8s.constants import CW_INTERRUPTABLE_TOLERATION, NVIDIA_GPU_TOLERATION
25-
from iris.cluster.k8s.kubectl import Kubectl, KubectlLogLine
25+
from iris.cluster.k8s.kubectl import Kubectl, KubectlError, KubectlLogLine
2626
from iris.cluster.runtime.env import build_common_iris_env, normalize_workdir_relative_path
2727
from iris.cluster.types import JobName, get_gpu_count
2828
from iris.rpc import cluster_pb2, logging_pb2
@@ -608,11 +608,23 @@ class KubernetesProvider:
608608

609609
def sync(self, batch: DirectProviderBatch) -> DirectProviderSyncResult:
610610
"""Sync task state: apply new pods, delete killed pods, poll running pods."""
611+
apply_failures: list[TaskUpdate] = []
611612
for run_req in batch.tasks_to_run:
612-
self._apply_pod(run_req)
613+
try:
614+
self._apply_pod(run_req)
615+
except KubectlError as exc:
616+
logger.error("Failed to apply pod for task %s: %s", run_req.task_id, exc)
617+
apply_failures.append(
618+
TaskUpdate(
619+
task_id=JobName.from_wire(run_req.task_id),
620+
attempt_id=run_req.attempt_id,
621+
new_state=cluster_pb2.TASK_STATE_FAILED,
622+
error=str(exc),
623+
)
624+
)
613625
for task_id in batch.tasks_to_kill:
614626
self._delete_pods_by_task_id(task_id)
615-
updates = self._poll_pods(batch.running_tasks)
627+
updates = apply_failures + self._poll_pods(batch.running_tasks)
616628
capacity = self._query_capacity()
617629
scheduling_events = self._fetch_scheduling_events()
618630
return DirectProviderSyncResult(updates=updates, scheduling_events=scheduling_events, capacity=capacity)

lib/iris/tests/cluster/controller/test_direct_controller.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,30 @@ def test_apply_failed_no_retry():
219219
assert task.failure_count == 1
220220

221221

222+
def test_apply_failed_directly_from_assigned():
223+
"""ASSIGNED -> FAILED without going through RUNNING (e.g. ConfigMap too large)."""
224+
state = make_controller_state()
225+
[task_id] = submit_direct_job(state, "fail-on-apply")
226+
batch = state.drain_for_direct_provider()
227+
attempt_id = batch.tasks_to_run[0].attempt_id
228+
229+
# Skip RUNNING -- fail immediately from ASSIGNED.
230+
state.apply_direct_provider_updates(
231+
[
232+
TaskUpdate(
233+
task_id=task_id,
234+
attempt_id=attempt_id,
235+
new_state=cluster_pb2.TASK_STATE_FAILED,
236+
error="kubectl apply failed: RequestEntityTooLarge",
237+
),
238+
]
239+
)
240+
241+
task = query_task(state, task_id)
242+
assert task.state == cluster_pb2.TASK_STATE_FAILED
243+
assert task.error == "kubectl apply failed: RequestEntityTooLarge"
244+
245+
222246
def test_apply_worker_failed_from_running_retries():
223247
"""WORKER_FAILED from RUNNING with retries remaining returns to PENDING."""
224248
state = make_controller_state()

lib/iris/tests/kubernetes/test_provider.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_sync_applies_pods_for_tasks_to_run(provider, mock_kubectl):
4141
assert result.updates == []
4242

4343

44-
def test_sync_propagates_kubectl_failure(provider, mock_kubectl):
44+
def test_sync_propagates_non_kubectl_failure(provider, mock_kubectl):
4545
mock_kubectl.apply_json.side_effect = RuntimeError("kubectl down")
4646
req = make_run_req("/test-job/0")
4747
batch = make_batch(tasks_to_run=[req])
@@ -50,6 +50,23 @@ def test_sync_propagates_kubectl_failure(provider, mock_kubectl):
5050
provider.sync(batch)
5151

5252

53+
def test_sync_catches_kubectl_error_and_returns_task_failure(provider, mock_kubectl):
54+
from iris.cluster.k8s.kubectl import KubectlError
55+
56+
mock_kubectl.apply_json.side_effect = KubectlError(
57+
"kubectl apply failed: Error from server (RequestEntityTooLarge): limit is 3145728"
58+
)
59+
req = make_run_req("/test-job/0")
60+
batch = make_batch(tasks_to_run=[req])
61+
62+
result = provider.sync(batch)
63+
64+
assert len(result.updates) == 1
65+
update = result.updates[0]
66+
assert update.new_state == cluster_pb2.TASK_STATE_FAILED
67+
assert "RequestEntityTooLarge" in update.error
68+
69+
5370
# ---------------------------------------------------------------------------
5471
# sync(): tasks_to_kill
5572
# ---------------------------------------------------------------------------

lib/zephyr/src/zephyr/execution.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,19 +1302,26 @@ class _CoordinatorJobConfig:
13021302
pipeline_id: int
13031303

13041304

1305-
def _run_coordinator_job(config_path: str, result_path: str) -> None:
1305+
def _run_coordinator_job(config_or_path: _CoordinatorJobConfig | str, result_path: str) -> None:
13061306
"""Entrypoint for the coordinator job.
13071307
13081308
Hosts the coordinator actor in-process via host_actor(), creates
13091309
worker actors as child jobs, runs the pipeline, and writes results
13101310
to disk. The coordinator monitors worker job health directly in its
13111311
maintenance loop (no separate watchdog thread).
1312+
1313+
``config_or_path`` is either the config object directly (LocalClient,
1314+
no serialization boundary) or a storage URL to load from (distributed
1315+
backends, avoids K8s ConfigMap 3 MiB limit).
13121316
"""
13131317
from fray.v2.client import current_client
13141318

1315-
logger.info("Loading coordinator config from %s", config_path)
1316-
with open_url(config_path, "rb") as f:
1317-
config: _CoordinatorJobConfig = cloudpickle.loads(f.read())
1319+
if isinstance(config_or_path, str):
1320+
logger.info("Loading coordinator config from %s", config_or_path)
1321+
with open_url(config_or_path, "rb") as f:
1322+
config = cloudpickle.loads(f.read())
1323+
else:
1324+
config = config_or_path
13181325

13191326
logger.info(
13201327
"Coordinator job starting: name=%s, execution_id=%s, pipeline=%d",
@@ -1547,9 +1554,23 @@ def execute(
15471554
name=self.name,
15481555
pipeline_id=self._pipeline_id,
15491556
)
1550-
ensure_parent_dir(config_path)
1551-
with open_url(config_path, "wb") as f:
1552-
f.write(cloudpickle.dumps(config))
1557+
1558+
# Distributed backends serialize the entrypoint into a K8s
1559+
# ConfigMap (3 MiB hard limit). Upload the config to shared
1560+
# storage and pass only the URL to keep the pickle small.
1561+
# LocalClient runs in-process with no serialization boundary,
1562+
# so pass the config object directly — this preserves closure
1563+
# semantics for callers that rely on mutating enclosing-scope
1564+
# state (e.g. _load_fuzzy_dupe_map_shard).
1565+
from fray.v2.local_backend import LocalClient
1566+
1567+
if isinstance(self.client, LocalClient):
1568+
entrypoint_args: tuple = (config, result_path)
1569+
else:
1570+
ensure_parent_dir(config_path)
1571+
with open_url(config_path, "wb") as f:
1572+
f.write(cloudpickle.dumps(config))
1573+
entrypoint_args = (config_path, result_path)
15531574

15541575
job_name = f"zephyr-{self.name}-p{self._pipeline_id}-a{attempt}"
15551576
# The wrapper job just blocks on child actors; real
@@ -1564,7 +1585,7 @@ def execute(
15641585
name=job_name,
15651586
entrypoint=Entrypoint.from_callable(
15661587
_run_coordinator_job,
1567-
args=(config_path, result_path),
1588+
args=entrypoint_args,
15681589
),
15691590
resources=ResourceConfig(cpu=1, ram="1g"),
15701591
)

0 commit comments

Comments
 (0)