Skip to content

Commit e7fd726

Browse files
yoblinclaude
authored and committed
[zephyr/iris] Fix ConfigMap size limit for large pipelines (#3908)
Two fixes for coordinator-as-job on KubernetesProvider: 1. Provider now catches KubectlError during pod apply and returns it as a TaskUpdate with the real error message, instead of letting it propagate and later surface as misleading "Pod not found". 2. Coordinator config is uploaded to object storage as job-config.pkl. The Entrypoint pickle contains only two string URLs, keeping the K8s ConfigMap payload trivially small regardless of dataset size. Depends on #3919 which fixes closure-mutation semantics in fuzzy dedup so that the cloudpickle round-trip through storage is safe. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent cbd1540 commit e7fd726

3 files changed

Lines changed: 43 additions & 6 deletions

File tree

lib/iris/src/iris/cluster/k8s/provider.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from iris.cluster.controller.transitions import ClusterCapacity, DirectProviderSyncResult, SchedulingEvent
2323
from iris.cluster.controller.transitions import DirectProviderBatch, RunningTaskEntry, TaskUpdate
2424
from iris.cluster.k8s.constants import CW_INTERRUPTABLE_TOLERATION, NVIDIA_GPU_TOLERATION
25-
from iris.cluster.k8s.kubectl import Kubectl, KubectlLogLine
25+
from iris.cluster.k8s.kubectl import Kubectl, KubectlError, KubectlLogLine
2626
from iris.cluster.runtime.env import build_common_iris_env, normalize_workdir_relative_path
2727
from iris.cluster.types import JobName, get_gpu_count
2828
from iris.rpc import cluster_pb2, logging_pb2
@@ -608,11 +608,23 @@ class KubernetesProvider:
608608

609609
def sync(self, batch: DirectProviderBatch) -> DirectProviderSyncResult:
610610
"""Sync task state: apply new pods, delete killed pods, poll running pods."""
611+
apply_failures: list[TaskUpdate] = []
611612
for run_req in batch.tasks_to_run:
612-
self._apply_pod(run_req)
613+
try:
614+
self._apply_pod(run_req)
615+
except KubectlError as exc:
616+
logger.error("Failed to apply pod for task %s: %s", run_req.task_id, exc)
617+
apply_failures.append(
618+
TaskUpdate(
619+
task_id=JobName.from_wire(run_req.task_id),
620+
attempt_id=run_req.attempt_id,
621+
new_state=cluster_pb2.TASK_STATE_FAILED,
622+
error=str(exc),
623+
)
624+
)
613625
for task_id in batch.tasks_to_kill:
614626
self._delete_pods_by_task_id(task_id)
615-
updates = self._poll_pods(batch.running_tasks)
627+
updates = apply_failures + self._poll_pods(batch.running_tasks)
616628
capacity = self._query_capacity()
617629
scheduling_events = self._fetch_scheduling_events()
618630
return DirectProviderSyncResult(updates=updates, scheduling_events=scheduling_events, capacity=capacity)

lib/iris/tests/kubernetes/test_provider.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_sync_applies_pods_for_tasks_to_run(provider, mock_kubectl):
4141
assert result.updates == []
4242

4343

44-
def test_sync_propagates_kubectl_failure(provider, mock_kubectl):
44+
def test_sync_propagates_non_kubectl_failure(provider, mock_kubectl):
4545
mock_kubectl.apply_json.side_effect = RuntimeError("kubectl down")
4646
req = make_run_req("/test-job/0")
4747
batch = make_batch(tasks_to_run=[req])
@@ -50,6 +50,23 @@ def test_sync_propagates_kubectl_failure(provider, mock_kubectl):
5050
provider.sync(batch)
5151

5252

53+
def test_sync_catches_kubectl_error_and_returns_task_failure(provider, mock_kubectl):
54+
from iris.cluster.k8s.kubectl import KubectlError
55+
56+
mock_kubectl.apply_json.side_effect = KubectlError(
57+
"kubectl apply failed: Error from server (RequestEntityTooLarge): limit is 3145728"
58+
)
59+
req = make_run_req("/test-job/0")
60+
batch = make_batch(tasks_to_run=[req])
61+
62+
result = provider.sync(batch)
63+
64+
assert len(result.updates) == 1
65+
update = result.updates[0]
66+
assert update.new_state == cluster_pb2.TASK_STATE_FAILED
67+
assert "RequestEntityTooLarge" in update.error
68+
69+
5370
# ---------------------------------------------------------------------------
5471
# sync(): tasks_to_kill
5572
# ---------------------------------------------------------------------------

lib/zephyr/src/zephyr/execution.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,7 +1301,7 @@ class _CoordinatorJobConfig:
13011301
pipeline_id: int
13021302

13031303

1304-
def _run_coordinator_job(config: _CoordinatorJobConfig, result_path: str) -> None:
1304+
def _run_coordinator_job(config_path: str, result_path: str) -> None:
13051305
"""Entrypoint for the coordinator job.
13061306
13071307
Hosts the coordinator actor in-process via host_actor(), creates
@@ -1311,6 +1311,10 @@ def _run_coordinator_job(config: _CoordinatorJobConfig, result_path: str) -> Non
13111311
"""
13121312
from fray.v2.client import current_client
13131313

1314+
logger.info("Loading coordinator config from %s", config_path)
1315+
with open_url(config_path, "rb") as f:
1316+
config: _CoordinatorJobConfig = cloudpickle.loads(f.read())
1317+
13141318
logger.info(
13151319
"Coordinator job starting: name=%s, execution_id=%s, pipeline=%d",
13161320
config.name,
@@ -1525,6 +1529,7 @@ def execute(
15251529
"Starting zephyr pipeline: %s (pipeline %d, attempt %d)", execution_id, self._pipeline_id, attempt
15261530
)
15271531

1532+
config_path = f"{self.chunk_storage_prefix}/{execution_id}/job-config.pkl"
15281533
result_path = f"{self.chunk_storage_prefix}/{execution_id}/results.pkl"
15291534

15301535
try:
@@ -1541,6 +1546,9 @@ def execute(
15411546
name=self.name,
15421547
pipeline_id=self._pipeline_id,
15431548
)
1549+
ensure_parent_dir(config_path)
1550+
with open_url(config_path, "wb") as f:
1551+
f.write(cloudpickle.dumps(config))
15441552

15451553
job_name = f"zephyr-{self.name}-p{self._pipeline_id}-a{attempt}"
15461554
# The wrapper job just blocks on child actors; real
@@ -1550,7 +1558,7 @@ def execute(
15501558
name=job_name,
15511559
entrypoint=Entrypoint.from_callable(
15521560
_run_coordinator_job,
1553-
args=(config, result_path),
1561+
args=(config_path, result_path),
15541562
),
15551563
resources=ResourceConfig(cpu=1, ram="1g"),
15561564
)

0 commit comments

Comments (0)