feat: add fuzzy deduplication post-processing

HarshaSatyavardhan · HarshaSatyavardhan · commit 9f4cb0b09eb0 · 2026-04-26T07:58:58.000+05:30
diff --git a/README.md b/README.md
@@ -173,6 +173,29 @@ Merge output behavior with multiple datasets:
 - Default (`run` with `execution_params.merge: true`, or `merge` without `--output-root`): each dataset is merged to its own `<dataset.output_dir>/merged`.
 - Shared root (`merge --output-root ...`): one merged subdirectory is created per dataset under the root.
 
+### Fuzzy deduplication (optional)
+
+After merging, MMIRAGE can drop near-duplicate rows using character n-gram MinHash + LSH. This is CPU-only and uses the lightweight `datasketch` package.
+
+Install the optional extra:
+
+```bash
+pip install -e '.[dedup]'
+```
+
+Enable in your YAML config:
+
+```yaml
+deduplication_params:
+  enabled: true
+  text_field: text
+  threshold: 0.85       # Jaccard similarity threshold
+  num_perm: 128         # MinHash signature size
+  shingle_size: 5       # character n-gram size
+```
+
+Dedup runs as part of `mmirage merge --config <cfg>` and as part of `mmirage run` when `execution_params.merge: true`. With `enabled: false` (default) the dedup module is not imported and there is no overhead.
+
 ### Multimodal: Processing images with VLMs
 
 MMIRAGE supports multimodal processing with vision-language models:
diff --git a/configs/config_comprehensive.yaml b/configs/config_comprehensive.yaml
@@ -185,6 +185,30 @@ execution_params:
   settle_time_seconds: 60
 
 
+# ============================================================================
+# DEDUPLICATION PARAMETERS (optional)
+# ============================================================================
+# Optional fuzzy deduplication applied after merging shards.
+# Uses character n-gram MinHash + LSH (via the `datasketch` package).
+# Install with: pip install -e '.[dedup]'
+
+deduplication_params:
+  # Set to true to enable fuzzy dedup; default false (no overhead).
+  enabled: false
+
+  # Column name to deduplicate on
+  text_field: text
+
+  # Jaccard similarity threshold above which rows are duplicates (0.0–1.0)
+  threshold: 0.85
+
+  # Number of MinHash permutations (signature size)
+  num_perm: 128
+
+  # Character n-gram size for shingling
+  shingle_size: 5
+
+
 # ============================================================================
 # USAGE EXAMPLES
 # ============================================================================
diff --git a/pyproject.toml b/pyproject.toml
@@ -48,6 +48,9 @@ dev = [
   "ipykernel",
   "pytest",
 ]
+dedup = [
+  "datasketch>=1.6.0",
+]
 
 [project.scripts]
 mmirage = "mmirage.cli:main"
diff --git a/src/mmirage/config/config.py b/src/mmirage/config/config.py
@@ -66,7 +66,9 @@ class ExecutionParams:
     def __post_init__(self):
         """Validate execution parameters."""
         if self.mode not in ("local", "slurm"):
-            raise ValueError(f"Invalid execution mode: {self.mode!r}. Must be 'local' or 'slurm'.")
+            raise ValueError(
+                f"Invalid execution mode: {self.mode!r}. Must be 'local' or 'slurm'."
+            )
         if self.mode == "slurm" and not self.account:
             raise ValueError("account is required when mode='slurm'")
         if self.max_retries < 0:
@@ -97,6 +99,45 @@ class ProcessingParams:
     remove_columns: bool = False
 
 
+@dataclass
+class DeduplicationParams:
+    """Configuration for fuzzy deduplication post-processing.
+
+    Attributes:
+        enabled: Whether deduplication is enabled. Defaults to False.
+        text_field: Column name containing text to deduplicate.
+        threshold: Jaccard similarity threshold above which rows are duplicates.
+            Must lie in [0.0, 1.0].
+        num_perm: Number of MinHash permutations (signature size). Must be >= 2.
+        shingle_size: Character n-gram size for shingling. Must be >= 1.
+
+    Raises:
+        ValueError: If any numeric parameter is outside its valid range.
+    """
+
+    enabled: bool = False
+    text_field: str = "text"
+    threshold: float = 0.85
+    num_perm: int = 128
+    shingle_size: int = 5
+
+    def __post_init__(self):
+        """Validate deduplication parameters."""
+        # Fail when the config object is built rather than after the merge.
+        if not 0.0 <= self.threshold <= 1.0:
+            raise ValueError(
+                f"threshold must be in [0.0, 1.0], got {self.threshold!r}"
+            )
+        if self.num_perm < 2:
+            raise ValueError(f"num_perm must be >= 2, got {self.num_perm!r}")
+        # A size of 0 makes every row shingle to the same set, so all rows
+        # but the first would be dropped as "duplicates".
+        if self.shingle_size < 1:
+            raise ValueError(
+                f"shingle_size must be >= 1, got {self.shingle_size!r}"
+            )
+
+
 @dataclass
 class MMirageConfig:
     """Main configuration class for MMIRAGE pipeline.
@@ -110,9 +151,13 @@ class MMirageConfig:
         loading_params: Parameters for loading input datasets.
         processing_params: Parameters for processing dataset samples.
         execution_params: Parameters for executing the pipeline (local/SLURM).
+        deduplication_params: Parameters for post-merge fuzzy deduplication.
     """
 
     processors: List[BaseProcessorConfig]
     loading_params: LoadingParams
     processing_params: ProcessingParams
     execution_params: ExecutionParams = field(default_factory=ExecutionParams)
+    deduplication_params: DeduplicationParams = field(
+        default_factory=DeduplicationParams
+    )
diff --git a/src/mmirage/core/postprocess/__init__.py b/src/mmirage/core/postprocess/__init__.py
@@ -0,0 +1 @@
+"""Post-processing modules for MMIRAGE pipeline."""
diff --git a/src/mmirage/core/postprocess/fuzzy_dedup.py b/src/mmirage/core/postprocess/fuzzy_dedup.py
@@ -0,0 +1,88 @@
+"""Fuzzy deduplication for MMIRAGE datasets.
+
+Uses character n-gram MinHash + LSH to identify near-duplicate text samples
+and drop them in a streaming "first-seen wins" pass.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Iterable, List, Set
+
+from datasets import Dataset
+
+from mmirage.config.config import DeduplicationParams
+
+logger = logging.getLogger(__name__)
+
+
+def _check_dependencies() -> None:
+    """Raise ImportError with an install hint when `datasketch` is missing."""
+    try:
+        import datasketch  # noqa: F401
+    except ImportError as exc:
+        requirement = "Deduplication requires `datasketch`. "
+        hint = "Install with: pip install 'mmirage[dedup]'"
+        raise ImportError(requirement + hint) from exc
+
+
+def _shingles(text: str, k: int) -> Set[bytes]:
+    """Return the set of UTF-8 encoded character k-grams of normalized `text`."""
+    norm = " ".join(text.lower().split())  # lowercase, collapse whitespace
+    starts = range(max(len(norm) - k, 0) + 1)  # texts shorter than k: one slice
+    return {norm[i : i + k].encode("utf-8") for i in starts}
+
+
+def deduplicate(dataset: Dataset, params: DeduplicationParams) -> Dataset:
+    """Drop near-duplicate rows in one "first-seen wins" MinHash + LSH pass.
+
+    Rows are visited in order. Each text is split into character n-grams of
+    size `shingle_size`, summarised as a MinHash with `num_perm` permutations,
+    and looked up in an LSH index of the rows kept so far. A row with any LSH
+    candidate is dropped; otherwise it is indexed and kept. LSH matching is
+    approximate, so `threshold` is a target Jaccard similarity rather than an
+    exact cut-off. Non-string values are compared via their `str()` form.
+
+    Args:
+        dataset: HuggingFace Dataset to deduplicate.
+        params: Deduplication configuration parameters.
+
+    Returns:
+        `dataset` itself if it has <= 1 row, else a Dataset of the kept rows.
+
+    Raises:
+        ImportError: If `datasketch` is not installed.
+        ValueError: If `text_field` is not a column or `shingle_size` < 1.
+    """
+    _check_dependencies()
+    from datasketch import MinHash, MinHashLSH
+
+    n = len(dataset)
+    if n <= 1:
+        logger.info("Dataset has %d row(s), skipping deduplication.", n)
+        return dataset
+
+    if params.text_field not in dataset.column_names:
+        raise ValueError(
+            f"Text field {params.text_field!r} not in dataset columns: "
+            f"{dataset.column_names}"
+        )
+    # Size 0 shingles every text to {b""}, silently dropping all rows but one.
+    if params.shingle_size < 1:
+        raise ValueError(f"shingle_size must be >= 1, got {params.shingle_size}")
+
+    lsh = MinHashLSH(threshold=params.threshold, num_perm=params.num_perm)
+    keep: List[int] = []
+    texts: Iterable = dataset[params.text_field]
+    for i, raw in enumerate(texts):
+        text = raw if isinstance(raw, str) else str(raw)
+        m = MinHash(num_perm=params.num_perm)
+        for s in _shingles(text, params.shingle_size):
+            m.update(s)
+        if not lsh.query(m):  # no candidate among kept rows -> first seen
+            lsh.insert(str(i), m)
+            keep.append(i)
+
+    msg = "Fuzzy dedup: %d → %d rows (%d duplicates removed)."
+    logger.info(msg, n, len(keep), n - len(keep))
+    return dataset.select(keep)
diff --git a/src/mmirage/merge_shards.py b/src/mmirage/merge_shards.py
@@ -7,7 +7,7 @@
 
 from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
 
-from mmirage.config.config import MMirageConfig
+from mmirage.config.config import DeduplicationParams, MMirageConfig
 from mmirage.core.loader.base import DatasetLike
 from mmirage.shard_utils import (
     _count_rows,
@@ -52,6 +52,17 @@ def _merge_datasetdict(shard_dsets: List[DatasetDict]) -> DatasetDict:
     return DatasetDict(merged)
 
 
+def _apply_dedup(ds: DatasetLike, params: DeduplicationParams) -> DatasetLike:
+    """Run fuzzy dedup on `ds`; a DatasetDict is deduplicated split by split."""
+    # Imported here so the dedup module is only loaded when dedup is applied.
+    from mmirage.core.postprocess.fuzzy_dedup import deduplicate
+
+    if not isinstance(ds, DatasetDict):
+        return deduplicate(ds, params)
+    per_split = {name: deduplicate(part, params) for name, part in ds.items()}
+    return DatasetDict(per_split)
+
+
 def _merge_shards(shard_dsets: List[DatasetLike]) -> DatasetLike:
     """Merge shard datasets into a single dataset."""
     if not shard_dsets:
@@ -67,12 +78,17 @@ def _merge_shards(shard_dsets: List[DatasetLike]) -> DatasetLike:
     )
 
 
-def merge_dataset_dir(dataset_dir: str, output_dir: str) -> MergeReport:
+def merge_dataset_dir(
+    dataset_dir: str,
+    output_dir: str,
+    dedup_params: Optional[DeduplicationParams] = None,
+) -> MergeReport:
     """Merge one dataset directory containing shard_* folders.
 
     Args:
         dataset_dir: Input directory containing shard_* folders.
         output_dir: Destination directory for merged dataset.
+        dedup_params: Optional fuzzy dedup config; applied before saving when enabled.
 
     Returns:
         MergeReport with summary details.
@@ -118,6 +134,16 @@ def merge_dataset_dir(dataset_dir: str, output_dir: str) -> MergeReport:
         )
 
     ds_merged = _merge_shards(shard_dsets)
+
+    if dedup_params is not None and dedup_params.enabled:
+        rows_before = _count_rows(ds_merged)
+        ds_merged = _apply_dedup(ds_merged, dedup_params)
+        rows_after = _count_rows(ds_merged)
+        logger.info(
+            f"Fuzzy dedup: {rows_before} → {rows_after} rows "
+            f"({rows_before - rows_after} duplicates removed)."
+        )
+
     merged_rows = _count_rows(ds_merged)
 
     _save_dataset_atomic(ds_merged, normalized_output_dir)
@@ -134,7 +160,11 @@ def merge_dataset_dir(dataset_dir: str, output_dir: str) -> MergeReport:
     )
 
 
-def merge_input_dir(input_dir: str, output_dir: str) -> List[MergeReport]:
+def merge_input_dir(
+    input_dir: str,
+    output_dir: str,
+    dedup_params: Optional[DeduplicationParams] = None,
+) -> List[MergeReport]:
     """Merge all shard datasets found under an input directory.
 
     The input can be either:
@@ -167,7 +197,7 @@ def merge_input_dir(input_dir: str, output_dir: str) -> List[MergeReport]:
             dataset_name = os.path.basename(dataset_dir)
             ds_output_dir = os.path.join(output_dir, dataset_name)
 
-        reports.append(merge_dataset_dir(dataset_dir, ds_output_dir))
+        reports.append(merge_dataset_dir(dataset_dir, ds_output_dir, dedup_params))
 
     return reports
 
@@ -210,7 +240,9 @@ def merge_from_config(
                 folder_name = f"{dataset_name}_{index}"
             output_dir = os.path.join(output_root, folder_name)
 
-        reports.append(merge_dataset_dir(dataset_dir, output_dir))
+        reports.append(
+            merge_dataset_dir(dataset_dir, output_dir, cfg.deduplication_params)
+        )
 
     return reports
 
@@ -232,6 +264,11 @@ def main():
         required=True,
         help="Directory to write merged datasets into.",
     )
+    ap.add_argument(
+        "--config",
+        default=None,
+        help="Optional MMIRAGE YAML config; enables fuzzy dedup if configured.",
+    )
     ap.add_argument(
         "--log-level",
         default="INFO",
@@ -241,7 +278,13 @@ def main():
     args = ap.parse_args()
     _configure_logging(args.log_level)
 
-    reports = merge_input_dir(args.input_dir, args.output_dir)
+    dedup_params: Optional[DeduplicationParams] = None
+    if args.config:
+        from mmirage.config.utils import load_mmirage_config
+
+        dedup_params = load_mmirage_config(args.config).deduplication_params
+
+    reports = merge_input_dir(args.input_dir, args.output_dir, dedup_params)
     for report in reports:
         skipped_total = report.skipped_invalid_dirs + report.skipped_zero_rows
         logger.info(
diff --git a/tests/test_dedup.py b/tests/test_dedup.py
@@ -0,0 +1,48 @@
+"""Smoke test for fuzzy deduplication on TinyStories."""
+
+import argparse
+import logging
+
+from datasets import load_dataset
+
+from mmirage.config.config import DeduplicationParams
+from mmirage.core.postprocess.fuzzy_dedup import deduplicate
+
+
+def main():
+    """Dedup TinyStories (optionally truncated) and print the removal stats."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Optional row limit (default: full dataset).",
+    )
+    ap.add_argument("--threshold", type=float, default=0.85)
+    ap.add_argument("--num-perm", type=int, default=128)
+    ap.add_argument("--shingle-size", type=int, default=5)
+    args = ap.parse_args()
+
+    logging.basicConfig(level=logging.INFO)
+
+    ds = load_dataset("roneneldan/TinyStories", split="train")
+    if args.limit is not None:
+        ds = ds.select(range(min(args.limit, len(ds))))
+    print(f"Loaded {len(ds):,} rows")
+
+    params = DeduplicationParams(
+        enabled=True,
+        text_field="text",
+        threshold=args.threshold,
+        num_perm=args.num_perm,
+        shingle_size=args.shingle_size,
+    )
+    deduped = deduplicate(ds, params)
+    removed = len(ds) - len(deduped)
+    # `--limit 0` yields an empty dataset; avoid dividing by zero below.
+    pct = removed / len(ds) * 100 if len(ds) else 0.0
+    print(f"{len(ds):,} → {len(deduped):,} (removed {removed:,}, {pct:.2f}%)")
+
+
+if __name__ == "__main__":
+    main()

Original file line number	Diff line number	Diff line change
`@@ -48,6 +48,9 @@ dev = [`
`48`	`48`	`"ipykernel",`
`49`	`49`	`"pytest",`
`50`	`50`	`]`
	`51`	`+dedup = [`
	`52`	`+ "datasketch>=1.6.0",`
	`53`	`+]`
`51`	`54`
`52`	`55`	`[project.scripts]`
`53`	`56`	`mmirage = "mmirage.cli:main"`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+"""Post-processing modules for MMIRAGE pipeline."""`