Skip to content

Commit 1af1f9e

Browse files
committed
Fix ZephyrContext not propagating explicit client to coordinator job
ZephyrContext.execute() submits a coordinator job that calls current_client() to auto-detect the backend. When a ZephyrContext was created with an explicit client (e.g., LocalClient()), the coordinator ignored it and auto-detected Ray, spawning workers as separate processes. This broke side-effect-based patterns like _load_fuzzy_dupe_map_shard, where a closure modifies a shared dict — each Ray actor got a serialized copy, leaving the original empty (zero fuzzy duplicates). Fix: set the context variable via set_current_client before submitting the coordinator job so it inherits the ZephyrContext's client. This also reverts the band-aid set_current_client wrappers in fuzzy.py, since ZephyrContext now handles propagation itself. Regression from b1d7828 (Refactor Zephyr coordinator to job).
1 parent 0234e37 commit 1af1f9e

2 files changed

Lines changed: 35 additions & 35 deletions

File tree

lib/marin/src/marin/processing/classification/deduplication/fuzzy.py

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import logging
88
from marin.utils import rebase_file_path
99
import pyarrow as pa
10-
from fray.v2.client import set_current_client
1110
from fray.v2.local_backend import LocalClient
1211
from marin.processing.classification.deduplication.dedup_commons import (
1312
DEFAULT_FILETYPES,
@@ -32,25 +31,23 @@
3231

3332
def _compute_fuzzy_dedup_stats(shards: list[str] | Sequence[str], method: str, level: str) -> DupCounters:
3433
with log_time(f"Compute fuzzy deduplication stats from {len(shards)} shards"):
35-
client = LocalClient()
36-
with set_current_client(client):
37-
ctx = ZephyrContext(client=client, name="fuzzy-dup-counts")
38-
result: DupCounters = ctx.execute( # type: ignore[bad-assignment]
39-
Dataset.from_list(shards)
40-
.load_parquet(columns=["component_id"])
41-
# Compute the per-component statistics and then roll them up into a single counter group
42-
.group_by(
43-
key=lambda r: r["component_id"],
44-
reducer=lambda _, items: DupCounters(
45-
method=method,
46-
level=level,
47-
total=(total := sum(1 for _ in items)),
48-
dups=total if total > 1 else 0,
49-
unique=1,
50-
),
51-
)
52-
.reduce(partial(sum, start=DupCounters(method=method, level=level))),
53-
)[0]
34+
ctx = ZephyrContext(client=LocalClient(), name="fuzzy-dup-counts")
35+
result: DupCounters = ctx.execute( # type: ignore[bad-assignment]
36+
Dataset.from_list(shards)
37+
.load_parquet(columns=["component_id"])
38+
# Compute the per-component statistics and then roll them up into a single counter group
39+
.group_by(
40+
key=lambda r: r["component_id"],
41+
reducer=lambda _, items: DupCounters(
42+
method=method,
43+
level=level,
44+
total=(total := sum(1 for _ in items)),
45+
dups=total if total > 1 else 0,
46+
unique=1,
47+
),
48+
)
49+
.reduce(partial(sum, start=DupCounters(method=method, level=level))),
50+
)[0]
5451
return result
5552

5653

@@ -66,12 +63,10 @@ def add_to_dup_map(record: dict):
6663
shard_dup_map[record["id"]] = record["fuzzy_duplicate"]
6764

6865
with log_time(f"Load fuzzy duplicate map from {len(shards)} shards"):
69-
client = LocalClient()
70-
with set_current_client(client):
71-
ctx = ZephyrContext(client=client, name="fuzzy-dup-map")
72-
ctx.execute(
73-
Dataset.from_list(shards).load_parquet().map(add_to_dup_map),
74-
)
66+
ctx = ZephyrContext(client=LocalClient(), name="fuzzy-dup-map")
67+
ctx.execute(
68+
Dataset.from_list(shards).load_parquet().map(add_to_dup_map),
69+
)
7570

7671
return shard_dup_map
7772

lib/zephyr/src/zephyr/execution.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1546,16 +1546,21 @@ def execute(
15461546
job_name = f"zephyr-{self.name}-p{self._pipeline_id}-a{attempt}"
15471547
# The wrapper job just blocks on child actors; real
15481548
# resources are requested by the coordinator/worker children.
1549-
self._coordinator_job = self.client.submit(
1550-
JobRequest(
1551-
name=job_name,
1552-
entrypoint=Entrypoint.from_callable(
1553-
_run_coordinator_job,
1554-
args=(config, result_path),
1555-
),
1556-
resources=ResourceConfig(cpu=1, ram="1g"),
1549+
# Set the context var so the coordinator job inherits self.client
1550+
# instead of auto-detecting (which may pick a different backend).
1551+
from fray.v2.client import set_current_client
1552+
1553+
with set_current_client(self.client):
1554+
self._coordinator_job = self.client.submit(
1555+
JobRequest(
1556+
name=job_name,
1557+
entrypoint=Entrypoint.from_callable(
1558+
_run_coordinator_job,
1559+
args=(config, result_path),
1560+
),
1561+
resources=ResourceConfig(cpu=1, ram="1g"),
1562+
)
15571563
)
1558-
)
15591564

15601565
backoff.reset()
15611566
logger.info("Coordinator job submitted: %s (job_id=%s)", job_name, self._coordinator_job.job_id)

0 commit comments

Comments
 (0)