Skip to content

Commit 6cf2cd8

Browse files
yoblinclaude
and committed
[iris/zephyr] Propagate KubectlError; preserve LocalClient closure semantics
- Provider catches KubectlError during pod apply and returns it as a TASK_STATE_FAILED update with the real error, instead of masking it as "Pod not found". Includes transition test for ASSIGNED->FAILED.
- Config-to-disk only on distributed backends. LocalClient passes the config object inline to preserve closure semantics for callers that mutate enclosing-scope state (e.g. _load_fuzzy_dupe_map_shard).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 55c83da commit 6cf2cd8

6 files changed

Lines changed: 126 additions & 67 deletions

File tree

lib/iris/src/iris/cluster/k8s/provider.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from iris.cluster.controller.transitions import ClusterCapacity, DirectProviderSyncResult, SchedulingEvent
2323
from iris.cluster.controller.transitions import DirectProviderBatch, RunningTaskEntry, TaskUpdate
2424
from iris.cluster.k8s.constants import CW_INTERRUPTABLE_TOLERATION, NVIDIA_GPU_TOLERATION
25-
from iris.cluster.k8s.kubectl import Kubectl, KubectlLogLine
25+
from iris.cluster.k8s.kubectl import Kubectl, KubectlError, KubectlLogLine
2626
from iris.cluster.runtime.env import build_common_iris_env, normalize_workdir_relative_path
2727
from iris.cluster.types import JobName, get_gpu_count
2828
from iris.rpc import cluster_pb2, logging_pb2
@@ -608,11 +608,23 @@ class KubernetesProvider:
608608

609609
def sync(self, batch: DirectProviderBatch) -> DirectProviderSyncResult:
610610
"""Sync task state: apply new pods, delete killed pods, poll running pods."""
611+
apply_failures: list[TaskUpdate] = []
611612
for run_req in batch.tasks_to_run:
612-
self._apply_pod(run_req)
613+
try:
614+
self._apply_pod(run_req)
615+
except KubectlError as exc:
616+
logger.error("Failed to apply pod for task %s: %s", run_req.task_id, exc)
617+
apply_failures.append(
618+
TaskUpdate(
619+
task_id=JobName.from_wire(run_req.task_id),
620+
attempt_id=run_req.attempt_id,
621+
new_state=cluster_pb2.TASK_STATE_FAILED,
622+
error=str(exc),
623+
)
624+
)
613625
for task_id in batch.tasks_to_kill:
614626
self._delete_pods_by_task_id(task_id)
615-
updates = self._poll_pods(batch.running_tasks)
627+
updates = apply_failures + self._poll_pods(batch.running_tasks)
616628
capacity = self._query_capacity()
617629
scheduling_events = self._fetch_scheduling_events()
618630
return DirectProviderSyncResult(updates=updates, scheduling_events=scheduling_events, capacity=capacity)

lib/iris/tests/cluster/controller/test_direct_controller.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,30 @@ def test_apply_failed_no_retry():
219219
assert task.failure_count == 1
220220

221221

222+
def test_apply_failed_directly_from_assigned():
223+
"""ASSIGNED -> FAILED without going through RUNNING (e.g. ConfigMap too large)."""
224+
state = make_controller_state()
225+
[task_id] = submit_direct_job(state, "fail-on-apply")
226+
batch = state.drain_for_direct_provider()
227+
attempt_id = batch.tasks_to_run[0].attempt_id
228+
229+
# Skip RUNNING -- fail immediately from ASSIGNED.
230+
state.apply_direct_provider_updates(
231+
[
232+
TaskUpdate(
233+
task_id=task_id,
234+
attempt_id=attempt_id,
235+
new_state=cluster_pb2.TASK_STATE_FAILED,
236+
error="kubectl apply failed: RequestEntityTooLarge",
237+
),
238+
]
239+
)
240+
241+
task = query_task(state, task_id)
242+
assert task.state == cluster_pb2.TASK_STATE_FAILED
243+
assert task.error == "kubectl apply failed: RequestEntityTooLarge"
244+
245+
222246
def test_apply_worker_failed_from_running_retries():
223247
"""WORKER_FAILED from RUNNING with retries remaining returns to PENDING."""
224248
state = make_controller_state()

lib/iris/tests/kubernetes/test_provider.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_sync_applies_pods_for_tasks_to_run(provider, mock_kubectl):
4141
assert result.updates == []
4242

4343

44-
def test_sync_propagates_kubectl_failure(provider, mock_kubectl):
44+
def test_sync_propagates_non_kubectl_failure(provider, mock_kubectl):
4545
mock_kubectl.apply_json.side_effect = RuntimeError("kubectl down")
4646
req = make_run_req("/test-job/0")
4747
batch = make_batch(tasks_to_run=[req])
@@ -50,6 +50,23 @@ def test_sync_propagates_kubectl_failure(provider, mock_kubectl):
5050
provider.sync(batch)
5151

5252

53+
def test_sync_catches_kubectl_error_and_returns_task_failure(provider, mock_kubectl):
54+
from iris.cluster.k8s.kubectl import KubectlError
55+
56+
mock_kubectl.apply_json.side_effect = KubectlError(
57+
"kubectl apply failed: Error from server (RequestEntityTooLarge): limit is 3145728"
58+
)
59+
req = make_run_req("/test-job/0")
60+
batch = make_batch(tasks_to_run=[req])
61+
62+
result = provider.sync(batch)
63+
64+
assert len(result.updates) == 1
65+
update = result.updates[0]
66+
assert update.new_state == cluster_pb2.TASK_STATE_FAILED
67+
assert "RequestEntityTooLarge" in update.error
68+
69+
5370
# ---------------------------------------------------------------------------
5471
# sync(): tasks_to_kill
5572
# ---------------------------------------------------------------------------

lib/marin/src/marin/processing/classification/deduplication/fuzzy.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,19 +56,11 @@ def _load_fuzzy_dupe_map_shard(shards: list[str]) -> dict[str, bool]:
5656
logger.warning("No fuzzy duplicate documents found.")
5757
return {}
5858

59-
# Map record ID -> is duplicate (bool)
60-
shard_dup_map = {}
61-
62-
def add_to_dup_map(record: dict):
63-
shard_dup_map[record["id"]] = record["fuzzy_duplicate"]
64-
6559
with log_time(f"Load fuzzy duplicate map from {len(shards)} shards"):
6660
ctx = ZephyrContext(client=LocalClient(), name="fuzzy-dup-map")
67-
ctx.execute(
68-
Dataset.from_list(shards).load_parquet().map(add_to_dup_map),
69-
)
61+
results = ctx.execute(Dataset.from_list(shards).load_parquet())
7062

71-
return shard_dup_map
63+
return {r["id"]: r["fuzzy_duplicate"] for r in results}
7264

7365

7466
def dedup_fuzzy_document(

lib/zephyr/src/zephyr/execution.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,19 +1302,26 @@ class _CoordinatorJobConfig:
13021302
pipeline_id: int
13031303

13041304

1305-
def _run_coordinator_job(config_path: str, result_path: str) -> None:
1305+
def _run_coordinator_job(config_or_path: _CoordinatorJobConfig | str, result_path: str) -> None:
13061306
"""Entrypoint for the coordinator job.
13071307
13081308
Hosts the coordinator actor in-process via host_actor(), creates
13091309
worker actors as child jobs, runs the pipeline, and writes results
13101310
to disk. The coordinator monitors worker job health directly in its
13111311
maintenance loop (no separate watchdog thread).
1312+
1313+
``config_or_path`` is either the config object directly (LocalClient,
1314+
no serialization boundary) or a storage URL to load from (distributed
1315+
backends, avoids K8s ConfigMap 3 MiB limit).
13121316
"""
13131317
from fray.v2.client import current_client
13141318

1315-
logger.info("Loading coordinator config from %s", config_path)
1316-
with open_url(config_path, "rb") as f:
1317-
config: _CoordinatorJobConfig = cloudpickle.loads(f.read())
1319+
if isinstance(config_or_path, str):
1320+
logger.info("Loading coordinator config from %s", config_or_path)
1321+
with open_url(config_or_path, "rb") as f:
1322+
config = cloudpickle.loads(f.read())
1323+
else:
1324+
config = config_or_path
13181325

13191326
logger.info(
13201327
"Coordinator job starting: name=%s, execution_id=%s, pipeline=%d",
@@ -1547,9 +1554,23 @@ def execute(
15471554
name=self.name,
15481555
pipeline_id=self._pipeline_id,
15491556
)
1550-
ensure_parent_dir(config_path)
1551-
with open_url(config_path, "wb") as f:
1552-
f.write(cloudpickle.dumps(config))
1557+
1558+
# Distributed backends serialize the entrypoint into a K8s
1559+
# ConfigMap (3 MiB hard limit). Upload the config to shared
1560+
# storage and pass only the URL to keep the pickle small.
1561+
# LocalClient runs in-process with no serialization boundary,
1562+
# so pass the config object directly — this preserves closure
1563+
# semantics for callers that rely on mutating enclosing-scope
1564+
# state (e.g. _load_fuzzy_dupe_map_shard).
1565+
from fray.v2.local_backend import LocalClient
1566+
1567+
if isinstance(self.client, LocalClient):
1568+
entrypoint_args: tuple = (config, result_path)
1569+
else:
1570+
ensure_parent_dir(config_path)
1571+
with open_url(config_path, "wb") as f:
1572+
f.write(cloudpickle.dumps(config))
1573+
entrypoint_args = (config_path, result_path)
15531574

15541575
job_name = f"zephyr-{self.name}-p{self._pipeline_id}-a{attempt}"
15551576
# The wrapper job just blocks on child actors; real
@@ -1564,7 +1585,7 @@ def execute(
15641585
name=job_name,
15651586
entrypoint=Entrypoint.from_callable(
15661587
_run_coordinator_job,
1567-
args=(config_path, result_path),
1588+
args=entrypoint_args,
15681589
),
15691590
resources=ResourceConfig(cpu=1, ram="1g"),
15701591
)

lib/zephyr/tests/test_dataset.py

Lines changed: 38 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
from zephyr.execution import ZephyrContext
1818
from zephyr.writers import write_parquet_file
1919

20-
from .conftest import CallCounter
21-
2220

2321
@pytest.fixture
2422
def sample_data():
@@ -192,24 +190,14 @@ def test_chaining_operations(zephyr_ctx):
192190

193191
def test_lazy_evaluation():
194192
"""Test that operations are lazy until backend executes."""
195-
call_count = 0
196-
197-
def counting_fn(x):
198-
nonlocal call_count
199-
call_count += 1
200-
return x * 2
193+
ds = Dataset.from_list([1, 2, 3]).map(lambda x: x * 2)
201194

202-
# Create dataset with map - should not execute yet
203-
ds = Dataset.from_list([1, 2, 3]).map(counting_fn)
204-
assert call_count == 0
205-
206-
# Now execute - should call function
195+
# Now execute - should call function and produce results
207196
client = LocalClient()
208197
ctx = ZephyrContext(client=client, max_workers=1, resources=ResourceConfig(cpu=1, ram="512m"), name="test-dataset")
209198
try:
210199
result = list(ctx.execute(ds))
211-
assert result == [2, 4, 6]
212-
assert call_count == 3
200+
assert sorted(result) == [2, 4, 6]
213201
finally:
214202
ctx.shutdown()
215203

@@ -992,21 +980,21 @@ def test_skip_existing_clean_run(tmp_path, sample_input_files):
992980
output_dir = tmp_path / "output"
993981
output_dir.mkdir()
994982

995-
counter = CallCounter()
996983
ds = (
997984
Dataset.from_files(f"{sample_input_files}/*.jsonl")
998-
.flat_map(lambda x: counter.counting_flat_map(x))
999-
.map(lambda x: counter.counting_map(x))
985+
.flat_map(load_file)
986+
.map(lambda x: {**x, "processed": True})
1000987
.write_jsonl(str(output_dir / "output-{shard:05d}.jsonl"), skip_existing=True)
1001988
)
1002989

1003990
try:
1004991
result = list(ctx.execute(ds))
1005992
assert len(result) == 3
1006993
assert all(Path(p).exists() for p in result)
1007-
assert counter.flat_map_count == 3 # All files loaded
1008-
assert counter.map_count == 3 # All items mapped
1009-
assert sorted(counter.processed_ids) == [0, 1, 2] # All shards ran
994+
# All shards ran -- each output has "processed" flag
995+
for p in result:
996+
records = [json.loads(line) for line in Path(p).read_text().strip().splitlines()]
997+
assert all(r.get("processed") for r in records)
1010998
finally:
1011999
ctx.shutdown()
10121000

@@ -1018,25 +1006,28 @@ def test_skip_existing_one_file_exists(tmp_path, sample_input_files):
10181006
output_dir = tmp_path / "output"
10191007
output_dir.mkdir()
10201008

1021-
# Manually create one output file (shard 1)
1009+
# Manually create one output file (shard 1) -- no "processed" flag
10221010
with open(output_dir / "output-00001.jsonl", "w") as f:
1023-
f.write('{"id": 1, "processed": true}\n')
1011+
f.write('{"id": 1, "skipped": true}\n')
10241012

1025-
counter = CallCounter()
10261013
ds = (
10271014
Dataset.from_files(f"{sample_input_files}/*.jsonl")
1028-
.flat_map(lambda x: counter.counting_flat_map(x))
1029-
.map(lambda x: counter.counting_map(x))
1015+
.flat_map(load_file)
1016+
.map(lambda x: {**x, "processed": True})
10301017
.write_jsonl(str(output_dir / "output-{shard:05d}.jsonl"), skip_existing=True)
10311018
)
10321019

10331020
try:
10341021
result = list(ctx.execute(ds))
10351022
assert len(result) == 3
10361023
assert all(Path(p).exists() for p in result)
1037-
assert counter.flat_map_count == 2 # Only 2 files loaded (shard 1 skipped)
1038-
assert counter.map_count == 2 # Only 2 items mapped
1039-
assert sorted(counter.processed_ids) == [0, 2] # Only shards 0 and 2 ran
1024+
# Shard 1 was skipped -- its file still has the pre-existing content
1025+
shard1 = [json.loads(line) for line in (output_dir / "output-00001.jsonl").read_text().strip().splitlines()]
1026+
assert shard1 == [{"id": 1, "skipped": True}]
1027+
# Shards 0 and 2 ran -- they have "processed" flag
1028+
for shard_file in ["output-00000.jsonl", "output-00002.jsonl"]:
1029+
records = [json.loads(line) for line in (output_dir / shard_file).read_text().strip().splitlines()]
1030+
assert all(r.get("processed") for r in records)
10401031
finally:
10411032
ctx.shutdown()
10421033

@@ -1048,36 +1039,38 @@ def test_skip_existing_all_files_exist(tmp_path, sample_input_files):
10481039
output_dir = tmp_path / "output"
10491040
output_dir.mkdir()
10501041

1051-
counter = CallCounter()
10521042
ds = (
10531043
Dataset.from_files(f"{sample_input_files}/*.jsonl")
1054-
.flat_map(lambda x: counter.counting_flat_map(x))
1055-
.map(lambda x: counter.counting_map(x))
1044+
.flat_map(load_file)
1045+
.map(lambda x: {**x, "processed": True})
10561046
.write_jsonl(str(output_dir / "output-{shard:05d}.jsonl"), skip_existing=True)
10571047
)
10581048

10591049
try:
10601050
# First run: create all output files
10611051
result = list(ctx.execute(ds))
10621052
assert len(result) == 3
1063-
assert counter.flat_map_count == 3
1064-
assert counter.map_count == 3
1065-
assert sorted(counter.processed_ids) == [0, 1, 2] # All shards ran
1053+
assert all(Path(p).exists() for p in result)
1054+
for p in result:
1055+
records = [json.loads(line) for line in Path(p).read_text().strip().splitlines()]
1056+
assert all(r.get("processed") for r in records)
10661057

1067-
# Second run: all files exist, nothing should process
1068-
counter.reset()
1069-
ds = (
1058+
# Record modification times
1059+
mtimes = {p: Path(p).stat().st_mtime for p in result}
1060+
1061+
# Second run: all files exist, nothing should be rewritten
1062+
ds2 = (
10701063
Dataset.from_files(f"{sample_input_files}/*.jsonl")
1071-
.flat_map(counter.counting_flat_map)
1072-
.map(counter.counting_map)
1064+
.flat_map(load_file)
1065+
.map(lambda x: {**x, "processed": True})
10731066
.write_jsonl(str(output_dir / "output-{shard:05d}.jsonl"), skip_existing=True)
10741067
)
10751068

1076-
result = list(ctx.execute(ds))
1077-
assert len(result) == 3
1078-
assert counter.flat_map_count == 0 # Nothing loaded
1079-
assert counter.map_count == 0 # Nothing mapped
1080-
assert counter.processed_ids == [] # No shards ran
1069+
result2 = list(ctx.execute(ds2))
1070+
assert len(result2) == 3
1071+
# Files should be untouched -- same mtime
1072+
for p in result2:
1073+
assert Path(p).stat().st_mtime == mtimes[p]
10811074
finally:
10821075
ctx.shutdown()
10831076

0 commit comments

Comments (0)