Skip to content

Commit 2892e39

Browse files
rjpower
authored and yonromai committed
[dedup] Replace side-effect mutation with reduce in fuzzy dedup map (#3938)
_load_fuzzy_dupe_map_shard built a dict by mutating a closure variable via .map() side effect, which only works with LocalClient (shared memory). Replace with .reduce() so the dict flows through the Zephyr pipeline as a proper return value. Also removes a stale comment from _compute_fuzzy_dedup_stats. Co-authored-by: Romain Yon <yonromai@users.noreply.github.com>
1 parent c18c986 commit 2892e39

1 file changed

Lines changed: 9 additions & 11 deletions

File tree

  • lib/marin/src/marin/processing/classification/deduplication

lib/marin/src/marin/processing/classification/deduplication/fuzzy.py

Lines changed: 9 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,6 @@ def _compute_fuzzy_dedup_stats(shards: list[str] | Sequence[str], method: str, l
3535
result: DupCounters = ctx.execute( # type: ignore[bad-assignment]
3636
Dataset.from_list(shards)
3737
.load_parquet(columns=["component_id"])
38-
# Compute the per-component statistics and then roll them up into a single counter group
3938
.group_by(
4039
key=lambda r: r["component_id"],
4140
reducer=lambda _, items: DupCounters(
@@ -56,19 +55,18 @@ def _load_fuzzy_dupe_map_shard(shards: list[str]) -> dict[str, bool]:
5655
logger.warning("No fuzzy duplicate documents found.")
5756
return {}
5857

59-
# Map record ID -> is duplicate (bool)
60-
shard_dup_map = {}
61-
62-
def add_to_dup_map(record: dict):
63-
shard_dup_map[record["id"]] = record["fuzzy_duplicate"]
64-
6558
with log_time(f"Load fuzzy duplicate map from {len(shards)} shards"):
6659
ctx = ZephyrContext(client=LocalClient(), name="fuzzy-dup-map")
67-
ctx.execute(
68-
Dataset.from_list(shards).load_parquet().map(add_to_dup_map),
69-
)
60+
result: dict[str, bool] = ctx.execute( # type: ignore[bad-assignment]
61+
Dataset.from_list(shards)
62+
.load_parquet()
63+
.reduce(
64+
local_reducer=lambda items: {r["id"]: r["fuzzy_duplicate"] for r in items},
65+
global_reducer=lambda dicts: {k: v for d in dicts for k, v in d.items()},
66+
),
67+
)[0]
7068

71-
return shard_dup_map
69+
return result
7270

7371

7472
def dedup_fuzzy_document(

0 commit comments

Comments (0)