[datakit] Replace exact_dedup bool with DedupMode enum

ravwojdyla · ravwojdyla · commit ce16520c74d7 · 2026-04-15T19:14:50.000Z
Per review, switch from a boolean flag to a StrEnum (NONE, EXACT) so
future dedup variants (e.g. minhash, suffix-array) have a place to
land without another boolean explosion.
diff --git a/lib/marin/src/marin/datakit/normalize.py b/lib/marin/src/marin/datakit/normalize.py
@@ -19,6 +19,7 @@
 from collections.abc import Callable, Iterator
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
+from enum import StrEnum
 from typing import Any
 
 import dupekit
@@ -32,6 +33,17 @@
 logger = logging.getLogger(__name__)
 
 
+class DedupMode(StrEnum):
+    """How aggressively to deduplicate records during normalization.
+
+    ``EXACT`` drops records with duplicate ``id`` (i.e. byte-identical text)
+    within each output shard.  ``NONE`` skips the dedup pass entirely.
+    """
+
+    NONE = "none"
+    EXACT = "exact"
+
+
 @dataclass
 class NormalizeSubdirResult:
     """Per-subdirectory outcome of :func:`normalize_to_parquet`.
@@ -177,7 +189,7 @@ def _build_pipeline(
     num_shards: int,
     text_field: str,
     id_field: str | None,
-    exact_dedup: bool,
+    dedup_mode: DedupMode,
 ) -> Dataset:
     """Build a single Zephyr pipeline for one subdirectory."""
     normalize_record = _make_normalize_fn(text_field, id_field)
@@ -192,7 +204,7 @@ def dedup(_key: int, items: Iterator[dict[str, Any]]) -> Iterator[dict[str, Any]
                 yield record
 
     def passthrough(_key: int, items: Iterator[dict[str, Any]]) -> Iterator[dict[str, Any]]:
-        """Yield items unchanged; used when exact dedup is disabled."""
+        """Yield items unchanged; used when dedup is disabled."""
         yield from items
 
     def has_text(record: dict[str, Any]) -> bool:
@@ -202,14 +214,16 @@ def has_text(record: dict[str, Any]) -> bool:
             return False
         return True
 
+    reducers: dict[DedupMode, Callable] = {DedupMode.EXACT: dedup, DedupMode.NONE: passthrough}
+
     return (
         Dataset.from_list(files)
         .flat_map(load_file)
         .filter(has_text)
         .map(normalize_record)
         .group_by(
             key=lambda r: int(r["id"], 16) % num_shards,
-            reducer=dedup if exact_dedup else passthrough,
+            reducer=reducers[dedup_mode],
             sort_by=lambda r: r["id"],
             num_output_shards=num_shards,
         )
@@ -229,15 +243,15 @@ def normalize_to_parquet(
     target_partition_bytes: int = 256 * 1024 * 1024,
     worker_resources: ResourceConfig | None = None,
     file_extensions: tuple[str, ...] | None = None,
-    exact_dedup: bool = True,
+    dedup_mode: DedupMode = DedupMode.EXACT,
 ) -> NormalizeResult:
     """Normalize raw downloaded data to the datakit standard Parquet format.
 
     Discovers all data files under *input_path*, groups them by subdirectory,
     and launches one Zephyr pipeline per subdirectory concurrently.  Each
     pipeline normalizes records (``id``, ``text``, preserves all other columns),
-    optionally exact-deduplicates by content, sorts by ``id``, and writes
-    Parquet partitions sized by *target_partition_bytes*.
+    optionally deduplicates by content per *dedup_mode*, sorts by ``id``, and
+    writes Parquet partitions sized by *target_partition_bytes*.
 
     Args:
         input_path: Root directory containing raw downloaded data.
@@ -255,9 +269,10 @@ def normalize_to_parquet(
         file_extensions: Tuple of file extensions to include (e.g.
             ``(".parquet",)``).  Defaults to all extensions supported by
             ``zephyr.readers.load_file``.
-        exact_dedup: If True (the default), drop records with duplicate ``id``
-            values (i.e. exact text duplicates) within each output shard.  Set
-            to False to skip the dedup pass and preserve all input records.
+        dedup_mode: How to deduplicate records within each output shard.
+            ``EXACT`` (the default) drops records with duplicate ``id`` values
+            (i.e. byte-identical text).  ``NONE`` skips dedup and preserves
+            all input records.
 
     Returns:
         A :class:`NormalizeResult` describing the output files and zephyr
@@ -285,7 +300,7 @@ def _run_subdir(subdir: str, files: list[str]) -> NormalizeSubdirResult:
             num_shards,
         )
 
-        pipeline = _build_pipeline(files, output_dir, num_shards, text_field, id_field, exact_dedup)
+        pipeline = _build_pipeline(files, output_dir, num_shards, text_field, id_field, dedup_mode)
         ctx = ZephyrContext(
             name=f"normalize-{subdir.replace('/', '-') if subdir else 'all'}",
             resources=resources,
@@ -332,7 +347,7 @@ def normalize_step(
     override_output_path: str | None = None,
     input_path: str | None = None,
     file_extensions: tuple[str, ...] | None = None,
-    exact_dedup: bool = True,
+    dedup_mode: DedupMode = DedupMode.EXACT,
 ) -> StepSpec:
     """Create a StepSpec that normalizes downloaded data to Parquet.
 
@@ -350,8 +365,8 @@ def normalize_step(
         file_extensions: Tuple of file extensions to include (e.g.
             ``(".parquet",)``).  Defaults to all extensions supported by
             ``zephyr.readers.load_file``.
-        exact_dedup: If True (the default), drop records with duplicate ``id``
-            values (i.e. exact text duplicates) within each output shard.
+        dedup_mode: How to deduplicate records within each output shard.
+            Defaults to ``DedupMode.EXACT``; use ``DedupMode.NONE`` to skip.
     """
     resolved_input = input_path or download.output_path
 
@@ -365,7 +380,7 @@ def normalize_step(
             target_partition_bytes=target_partition_bytes,
             worker_resources=worker_resources,
             file_extensions=file_extensions,
-            exact_dedup=exact_dedup,
+            dedup_mode=dedup_mode,
         ),
         deps=[download],
         hash_attrs={
@@ -374,7 +389,7 @@ def normalize_step(
             "target_partition_bytes": target_partition_bytes,
             "input_path": resolved_input,
             "file_extensions": file_extensions,
-            "exact_dedup": exact_dedup,
+            "dedup_mode": dedup_mode,
         },
         override_output_path=override_output_path,
     )