
Commit 4693e5e

Authored by ravwojdyla-agent, ravwojdyla, and claude
fuzzy_dups: add opt-in resume for CC iterations (#5135)
connected_components(resume=True) scans output_dir for existing it_N/ parquet sets and, when it finds a complete iteration (file count matches ctx.max_workers), skips the initial scatter plus prior iterations and re-enters the Hash-to-Min loop at the next iteration. It falls back to a full restart when no complete state is found. The flag is plumbed through compute_fuzzy_dups_attrs as cc_resume so callers can opt in without touching the CC API directly.

Co-authored-by: Rafal Wojdyla <ravwojdyla@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ef51f83 commit 4693e5e

2 files changed: 64 additions & 21 deletions
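
For context, a minimal opt-in sketch from the caller's side (not part of the diff), assuming an existing bucket Dataset `bucket_ds` and a ZephyrContext `ctx`; the output path is illustrative:

    # Re-running the same invocation after an interruption: with resume=True,
    # any complete it_N/ directories under output_dir are reused rather than
    # recomputed, and the Hash-to-Min loop restarts at the next iteration.
    from marin.processing.classification.deduplication.connected_components import connected_components

    converged, cc_files = connected_components(
        bucket_ds,
        ctx,
        output_dir="gs://example-bucket/dedup/metadata/cc",  # hypothetical path
        max_iterations=10,
        resume=True,
    )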

lib/marin/src/marin/processing/classification/deduplication/connected_components.py
58 additions & 20 deletions
@@ -8,9 +8,34 @@
 import dupekit
 from zephyr import Dataset, ZephyrContext, counters, write_parquet_file, ShardInfo
 
+from marin.utils import fsspec_glob
+
 logger = logging.getLogger(__name__)
 
 
+def _find_last_complete_iteration(
+    output_dir: str, max_iterations: int, expected_parquets: int
+) -> tuple[int, list[str]] | None:
+    """Return (last_iteration, parquet_paths) from prior run outputs, or None if nothing reusable.
+
+    A CC iteration ``it_N/`` is considered complete iff its parquet file count equals
+    ``expected_parquets`` (= ``ctx.max_workers`` at write time). Iteration 0 uses the
+    ``part-{shard:05d}.parquet`` naming; iterations 1+ use ``part-{shard:05d}-of-{total:05d}.parquet``.
+    Both are detected by globbing ``it_N/*.parquet``.
+    """
+    last_complete = -1
+    last_paths: list[str] = []
+    for i in range(max_iterations + 1):
+        paths = fsspec_glob(f"{output_dir}/it_{i}/*.parquet")
+        if len(paths) != expected_parquets:
+            break
+        last_complete = i
+        last_paths = paths
+    if last_complete < 0:
+        return None
+    return last_complete, last_paths
+
+
 # TODO (rav): can we have just a single id that's expected to be clean on the inputs?
 class RecordId(TypedDict):
     record_id: Any
@@ -55,6 +80,7 @@ def connected_components(
     output_dir: str,
     max_iterations: int = 10,
     preserve_singletons: bool = True,
+    resume: bool = False,
 ) -> tuple[bool, Sequence[str]]:
     """
     Connected Components implementation using Zephyr Dataset API and Hash-to-Min algorithm (https://arxiv.org/abs/1203.5387)
@@ -65,6 +91,9 @@ def connected_components(
         output_dir: Directory to write intermediate and final output files
         max_iterations: Maximum number of iterations to run the connected components algorithm
         preserve_singletons: Whether to preserve single-node buckets in the output
+        resume: If True, skip iterations whose ``it_N/`` already contains a complete set of
+            parquet files (count == ``ctx.max_workers``). Starts from the first incomplete
+            iteration. If no complete prior state exists, runs from scratch.
     """
 
     def _reduce_bucket_to_links(bucket: str, items: Iterator[CCInput]) -> Iterator[dict]:
@@ -124,25 +153,34 @@ def _dedup_combiner(bucket: str, items: Iterator[CCInput]) -> Iterator[CCInput]:
     # I/O amplification.
     num_reduce_shards = ctx.max_workers
 
-    curr_it = ctx.execute(
-        ds
-        # Group nodes in buckets, deduplicate, and emit pairwise links
-        .group_by(
-            lambda x: x["bucket"],
-            reducer=_reduce_bucket_to_links,
-            combiner=_dedup_combiner,
-            num_output_shards=num_reduce_shards,
-        )
-        # Construct Node state, init with:
-        # * each node is its own component
-        # * adjacency list from links
-        .group_by(
-            lambda x: x["source_id_norm"],
-            reducer=_build_adjacency,
-            num_output_shards=num_reduce_shards,
-        ).write_parquet(f"{output_dir}/it_0/part-{{shard:05d}}.parquet"),
-        verbose=True,
-    ).results
+    start_iteration = 1
+    curr_it: Sequence[str]
+    resumed = _find_last_complete_iteration(output_dir, max_iterations, num_reduce_shards) if resume else None
+    if resumed is not None:
+        last_it, last_paths = resumed
+        logger.info("CC resume: skipping through it_%d (%d parquets present)", last_it, len(last_paths))
+        curr_it = last_paths
+        start_iteration = last_it + 1
+    else:
+        curr_it = ctx.execute(
+            ds
+            # Group nodes in buckets, deduplicate, and emit pairwise links
+            .group_by(
+                lambda x: x["bucket"],
+                reducer=_reduce_bucket_to_links,
+                combiner=_dedup_combiner,
+                num_output_shards=num_reduce_shards,
+            )
+            # Construct Node state, init with:
+            # * each node is its own component
+            # * adjacency list from links
+            .group_by(
+                lambda x: x["source_id_norm"],
+                reducer=_build_adjacency,
+                num_output_shards=num_reduce_shards,
+            ).write_parquet(f"{output_dir}/it_0/part-{{shard:05d}}.parquet"),
+            verbose=True,
+        ).results
 
     def _get_write_shard_and_count_fn(iteration: int):
         # NOTE: this function exists to make the iteration number closure capture explicit
@@ -167,7 +205,7 @@ def counting_iter():
         return _write_shard_and_count
 
     converged = False
-    for i in range(1, max_iterations + 1):  # type: ignore[bad-assignment]
+    for i in range(start_iteration, max_iterations + 1):  # type: ignore[bad-assignment]
         logger.info(f"Connected components iteration {i}...")
 
         shard_results = ctx.execute(
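
To make the completeness rule concrete, here is a small self-contained illustration (stdlib only; `find_last_complete` is a local stand-in for `_find_last_complete_iteration`, using glob on a temp dir instead of fsspec_glob):

    # An it_N/ directory counts as complete iff it holds exactly `expected`
    # parquet files; the scan stops at the first gap, so later complete
    # directories past a gap are never reused.
    import glob
    import os
    import tempfile

    def find_last_complete(output_dir: str, max_iterations: int, expected: int):
        last, paths = -1, []
        for i in range(max_iterations + 1):
            found = glob.glob(os.path.join(output_dir, f"it_{i}", "*.parquet"))
            if len(found) != expected:
                break  # first incomplete/missing iteration ends the scan
            last, paths = i, found
        return None if last < 0 else (last, paths)

    with tempfile.TemporaryDirectory() as d:
        # it_0 is complete (2 of 2 files); it_1 is incomplete (1 of 2).
        for it, n in [(0, 2), (1, 1)]:
            os.makedirs(os.path.join(d, f"it_{it}"))
            for s in range(n):
                open(os.path.join(d, f"it_{it}", f"part-{s:05d}.parquet"), "w").close()
        # Returns (0, [...it_0 paths...]); a resumed run restarts the loop at iteration 1.
        print(find_last_complete(d, max_iterations=10, expected=2))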

lib/marin/src/marin/processing/classification/deduplication/fuzzy_dups.py
6 additions & 1 deletion
@@ -233,6 +233,7 @@ def compute_fuzzy_dups_attrs(
     inputs: list[MinHashAttrData],
     output_path: str,
     cc_max_iterations: int = 10,
+    cc_resume: bool = False,
     max_parallelism: int,
     worker_resources: ResourceConfig | None = None,
     coordinator_resources: ResourceConfig | None = None,
@@ -299,7 +300,11 @@ def compute_fuzzy_dups_attrs(
 
     bucket_ds = Dataset.from_list(entry_groups).flat_map(_emit_bucket_records)
     converged, cc_files = connected_components(
-        bucket_ds, ctx, output_dir=f"{output_path}/metadata/cc", max_iterations=cc_max_iterations
+        bucket_ds,
+        ctx,
+        output_dir=f"{output_path}/metadata/cc",
+        max_iterations=cc_max_iterations,
+        resume=cc_resume,
     )
     if not converged:
         # TODO (rav): log the number of changed nodes?
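
And the corresponding opt-in from the pipeline entry point, as a sketch: only cc_resume is the new knob; `minhash_inputs` and the remaining argument values are assumed/illustrative:

    # cc_resume is forwarded as connected_components(resume=...); everything
    # else below is an illustrative placeholder.
    compute_fuzzy_dups_attrs(
        inputs=minhash_inputs,  # assumed: a prepared list[MinHashAttrData]
        output_path="gs://example-bucket/dedup",  # hypothetical path
        cc_max_iterations=10,
        cc_resume=True,
        max_parallelism=64,
    )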
