From 55fa9a969d956abd604c6036fb604472dd36212d Mon Sep 17 00:00:00 2001 From: cyruszhang Date: Thu, 27 Feb 2025 10:11:57 -0800 Subject: [PATCH 1/3] add initial config for c4 refine --- tools/dedupe_suite/configs/c4-benchmark.yaml | 54 ++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tools/dedupe_suite/configs/c4-benchmark.yaml diff --git a/tools/dedupe_suite/configs/c4-benchmark.yaml b/tools/dedupe_suite/configs/c4-benchmark.yaml new file mode 100644 index 0000000000..4fc482adae --- /dev/null +++ b/tools/dedupe_suite/configs/c4-benchmark.yaml @@ -0,0 +1,54 @@ +# redpajama-c4-refine with more aggressive filtering and deduplication to produce benchmark dataset + +# global parameters +project_name: 'Data-Juicer-recipes-c4' +dataset_path: '/path/to/your/dataset' # path to your dataset directory or file +export_path: '/path/to/your/dataset.jsonl' # path to your dataset result file + +np: 50 # number of subprocess to process your dataset +open_tracer: True + +# process schedule +# a list of several process operators with their arguments +process: + - clean_email_mapper: + - clean_links_mapper: + - fix_unicode_mapper: + - punctuation_normalization_mapper: + - whitespace_normalization_mapper: + + - alphanumeric_filter: + tokenization: false + min_ratio: 0.65 # <3sigma (0.740) + max_ratio: 0.9 # >3sigma (0.867) + - average_line_length_filter: # for code + max_len: 3000 # >3sigma (1277) + - character_repetition_filter: + rep_len: 10 + max_ratio: 0.3 # >3sigma (0.168) + - language_id_score_filter: + min_score: 0.6 + - maximum_line_length_filter: # for code + max_len: 4000 # >3sigma (2017) + - perplexity_filter: + lang: en + max_ppl: 6000 #(>3sigma 4543) + - special_characters_filter: + max_ratio: 0.4 # > 3sigma (0.303) + - words_num_filter: + tokenization: true + min_num: 20 + max_num: 10000 + - word_repetition_filter: + lang: en + tokenization: true + rep_len: 10 + max_ratio: 0.231 # 3sigma + + - document_simhash_deduplicator: + tokenization: space + window_size: 6 + lowercase: true + ignore_pattern: '\p{P}' + num_blocks: 6 + hamming_distance: 4 From 6f8bdede7af3b43af64733838c58ad036d3fd55c Mon Sep 17 00:00:00 2001 From: cyruszhang Date: Tue, 4 Mar 2025 13:44:27 -0800 Subject: [PATCH 2/3] add dupe gen version 0.1 --- tests/ops/data/img1_dup.png | 1 + tests/ops/data/img2_dup.jpg | 1 + tests/ops/data/img3_dup.jpg | 1 + tests/ops/data/img3_dup_dup.jpg | 1 + tests/ops/data/video1_dup.mp4 | 1 + tests/ops/data/video2_dup.mp4 | 1 + tests/ops/data/video3_dup.mp4 | 1 + tests/ops/data/video3_dup_dup.mp4 | 1 + .../{c4-benchmark.yaml => c4-baseline.yaml} | 0 tools/dedupe_suite/dupe_gen/app.py | 91 +++++ tools/dedupe_suite/dupe_gen/distributor.py | 135 +++++++ tools/dedupe_suite/dupe_gen/generator.py | 345 ++++++++++++++++++ tools/dedupe_suite/dupe_gen/modifier.py | 180 +++++++++ 13 files changed, 759 insertions(+) create mode 120000 tests/ops/data/img1_dup.png create mode 120000 tests/ops/data/img2_dup.jpg create mode 120000 tests/ops/data/img3_dup.jpg create mode 120000 tests/ops/data/img3_dup_dup.jpg create mode 120000 tests/ops/data/video1_dup.mp4 create mode 120000 tests/ops/data/video2_dup.mp4 create mode 120000 tests/ops/data/video3_dup.mp4 create mode 120000 tests/ops/data/video3_dup_dup.mp4 rename tools/dedupe_suite/configs/{c4-benchmark.yaml => c4-baseline.yaml} (100%) create mode 100644 tools/dedupe_suite/dupe_gen/app.py create mode 100644 tools/dedupe_suite/dupe_gen/distributor.py create mode 100644 tools/dedupe_suite/dupe_gen/generator.py create mode 100644 
tools/dedupe_suite/dupe_gen/modifier.py diff --git a/tests/ops/data/img1_dup.png b/tests/ops/data/img1_dup.png new file mode 120000 index 0000000000..d62a859006 --- /dev/null +++ b/tests/ops/data/img1_dup.png @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img1.png \ No newline at end of file diff --git a/tests/ops/data/img2_dup.jpg b/tests/ops/data/img2_dup.jpg new file mode 120000 index 0000000000..8a99a25260 --- /dev/null +++ b/tests/ops/data/img2_dup.jpg @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img2.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup.jpg b/tests/ops/data/img3_dup.jpg new file mode 120000 index 0000000000..6e8c435e31 --- /dev/null +++ b/tests/ops/data/img3_dup.jpg @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img3.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup_dup.jpg b/tests/ops/data/img3_dup_dup.jpg new file mode 120000 index 0000000000..f539c0972a --- /dev/null +++ b/tests/ops/data/img3_dup_dup.jpg @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img3_dup.jpg \ No newline at end of file diff --git a/tests/ops/data/video1_dup.mp4 b/tests/ops/data/video1_dup.mp4 new file mode 120000 index 0000000000..6d1bbbc84b --- /dev/null +++ b/tests/ops/data/video1_dup.mp4 @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video1.mp4 \ No newline at end of file diff --git a/tests/ops/data/video2_dup.mp4 b/tests/ops/data/video2_dup.mp4 new file mode 120000 index 0000000000..8fa6335be5 --- /dev/null +++ b/tests/ops/data/video2_dup.mp4 @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video2.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup.mp4 b/tests/ops/data/video3_dup.mp4 new file mode 120000 index 0000000000..f631588609 --- /dev/null +++ b/tests/ops/data/video3_dup.mp4 @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video3.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup_dup.mp4 b/tests/ops/data/video3_dup_dup.mp4 new file mode 120000 index 0000000000..6a225ba396 --- /dev/null +++ b/tests/ops/data/video3_dup_dup.mp4 @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video3_dup.mp4 \ No newline at end of file diff --git a/tools/dedupe_suite/configs/c4-benchmark.yaml b/tools/dedupe_suite/configs/c4-baseline.yaml similarity index 100% rename from tools/dedupe_suite/configs/c4-benchmark.yaml rename to tools/dedupe_suite/configs/c4-baseline.yaml diff --git a/tools/dedupe_suite/dupe_gen/app.py b/tools/dedupe_suite/dupe_gen/app.py new file mode 100644 index 0000000000..f5de54e5d9 --- /dev/null +++ b/tools/dedupe_suite/dupe_gen/app.py @@ -0,0 +1,91 @@ +# Example usage +from .generator import DuplicateGenerator, DuplicationConfig + + +def generate_benchmarks(base_dataset_path='data/c4_sample.jsonl'): + """Generate benchmark datasets with various duplication configurations + + Args: + base_dataset_path: Path to the base dataset file (JSONL format) + """ + # 1. Basic usage with default settings + generator = DuplicateGenerator() + stats = generator.generate_from_dataset( + dataset_path=base_dataset_path, + output_path='output_with_duplicates.jsonl') + + # 2. 
Clustered duplicates with specific ratio + # clustered_config = DuplicationConfig(ratio=0.3, + # distribution='clustered', + # cluster_size=5, + # types={ + # 'exact': 0.2, + # 'near': 0.6, + # 'far': 0.2 + # }) + + # clustered_generator = DuplicateGenerator(clustered_config) + # clustered_stats = clustered_generator.generate_from_dataset( + # dataset_path=base_dataset_path, + # output_path='output_clustered_duplicates.jsonl') + + # 3. High duplication rate with mostly near-duplicates + # high_dup_config = DuplicationConfig( + # ratio=0.7, + # types={ + # 'exact': 0.1, + # 'near': 0.8, + # 'far': 0.1 + # }, + # modification_levels={ + # 'near': 0.05, + # 'far': 0.2 + # } # Very subtle near-duplicates + # ) + + # high_dup_generator = DuplicateGenerator(high_dup_config) + # high_dup_stats = high_dup_generator.generate_from_dataset( + # dataset_path=base_dataset_path, + # output_path='output_high_duplication.jsonl') + + # 4. Generate benchmarks of different sizes + for size_name, num_docs in [('small', 10000), ('medium', 100000), + ('large', 1000000)]: + # Create sample of appropriate size + sample_path = f'sample_{size_name}.jsonl' + with open(sample_path, 'w') as outfile: + with open(base_dataset_path, 'r') as infile: + for i, line in enumerate(infile): + if i >= num_docs: + break + outfile.write(line) + + # Generate duplicates with different configurations + for dup_rate in [0.1, 0.3, 0.5]: + for dist in ['random', 'clustered']: + config = DuplicationConfig(ratio=dup_rate, distribution=dist) + generator = DuplicateGenerator(config) + fn = f'output_{size_name}_{dist}_{int(dup_rate*100)}pct_dups.jsonl' # noqa + stats = generator.generate_from_dataset( + dataset_path=sample_path, output_path=fn) + + print(f'Generated {size_name} benchmark ' + f'with {dist} distribution ' + f'and {dup_rate*100}% duplicates') + print(f'Stats: {stats}') + + +# If this file is run directly +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser( + description='Generate benchmark datasets with controlled duplication') + parser.add_argument('--dataset', + type=str, + default='data/c4_sample.jsonl', + help='Path to the base dataset file (JSONL format)') + + args = parser.parse_args() + + generate_benchmarks(args.dataset) diff --git a/tools/dedupe_suite/dupe_gen/distributor.py b/tools/dedupe_suite/dupe_gen/distributor.py new file mode 100644 index 0000000000..ea37abc2c7 --- /dev/null +++ b/tools/dedupe_suite/dupe_gen/distributor.py @@ -0,0 +1,135 @@ +# dupgen/distributors.py +import random +from typing import Dict, List, Tuple + +import numpy as np + + +class DuplicateDistributor: + """Controls how duplicates are distributed in the dataset""" + + @staticmethod + def random_distribution(originals: List[Dict], + num_duplicates: int) -> List[Tuple[Dict, int]]: + """Distribute duplicates randomly across originals + + Args: + originals: List of original documents + num_duplicates: Number of duplicates to create + + Returns: + List of (original, count) tuples indicating how many duplicates + to create + """ + distribution = [] + + # Simple random selection with replacement + for _ in range(num_duplicates): + original = random.choice(originals) + distribution.append((original, 1)) + + return distribution + + @staticmethod + def clustered_distribution( + originals: List[Dict], + num_duplicates: int, + avg_cluster_size: float = 5.0, + variance: float = 0.5) -> List[Tuple[Dict, int]]: + """Distribute duplicates in clusters + + Args: + originals: List of original documents + num_duplicates: Number of 
duplicates to create + avg_cluster_size: Average number of duplicates per original + variance: Variance in cluster sizes (0-1, higher means more + variance) + + Returns: + List of (original, count) tuples indicating how many duplicates + to create + """ + # Determine how many originals to use + num_clusters = max(1, int(num_duplicates / avg_cluster_size)) + + # Select originals to duplicate + selected_originals = random.sample(originals, + k=min(num_clusters, len(originals))) + + # Generate cluster sizes following a power law distribution + alpha = 2.0 # Power law exponent (adjust for different distributions) + sizes = np.random.power( + alpha, size=len(selected_originals)) * avg_cluster_size * 2 + sizes = np.maximum(sizes, 1) # Ensure at least size 1 + + # Apply variance + if variance > 0: + # Add noise proportional to variance + noise = np.random.normal(0, + variance * avg_cluster_size, + size=len(sizes)) + sizes = np.maximum(sizes + noise, 1) + + # Convert to integers + sizes = sizes.astype(int) + + # Adjust to match total required duplicates + total = sum(sizes) + if total > num_duplicates: + # Scale down + sizes = np.floor(sizes * (num_duplicates / total)).astype(int) + # Distribute remaining + remainder = num_duplicates - sum(sizes) + for i in range(remainder): + sizes[i % len(sizes)] += 1 + elif total < num_duplicates: + # Scale up + deficit = num_duplicates - total + # Distribute deficit + for i in range(deficit): + sizes[i % len(sizes)] += 1 + + # Create distribution + distribution = [(original, int(size)) + for original, size in zip(selected_originals, sizes)] + + return distribution + + @staticmethod + def power_law_distribution( + originals: List[Dict], + num_duplicates: int, + exponent: float = 2.0) -> List[Tuple[Dict, int]]: + """Distribute duplicates following a power law (few originals get + many duplicates) + + Args: + originals: List of original documents + num_duplicates: Number of duplicates to create + exponent: Power law exponent (higher means more skewed + distribution) + + Returns: + List of (original, count) tuples indicating how many duplicates + to create + """ + # Select a subset of originals to duplicate + num_to_duplicate = min(len(originals), max(1, + int(len(originals) * 0.1))) + selected_originals = random.sample(originals, k=num_to_duplicate) + + # Generate power law weights + weights = np.power(np.arange(1, num_to_duplicate + 1), -exponent) + weights = weights / np.sum(weights) # Normalize + + # Distribute duplicates according to weights + counts = np.zeros(num_to_duplicate, dtype=int) + for _ in range(num_duplicates): + idx = np.random.choice(num_to_duplicate, p=weights) + counts[idx] += 1 + + # Create distribution + distribution = [(original, int(count)) + for original, count in zip(selected_originals, counts)] + + return distribution diff --git a/tools/dedupe_suite/dupe_gen/generator.py b/tools/dedupe_suite/dupe_gen/generator.py new file mode 100644 index 0000000000..6d7ec4763f --- /dev/null +++ b/tools/dedupe_suite/dupe_gen/generator.py @@ -0,0 +1,345 @@ +import hashlib +import json +import multiprocessing as mp +import os +import random +from dataclasses import dataclass +from typing import Any, Dict, List + +from tqdm import tqdm + +from .distributor import DuplicateDistributor +from .modifier import ModificationStrategy + + +@dataclass +class DuplicationConfig: + """Configuration for duplication generation""" + ratio: float = 0.3 # Percentage of duplicates in final dataset + types: Dict[ + str, + float] = None # Distribution of duplicate types 
(exact, near, far) + distribution: str = 'random' # How duplicates are distributed + cluster_size: int = 5 # Average size of duplicate clusters + cluster_variance: float = 0.5 # Variance in cluster sizes + modification_levels: Dict[str, + float] = None # How much to modify for each type + cross_source: bool = False # Whether to create duplicates across sources + + def __post_init__(self): + # Default duplicate type distribution + if self.types is None: + self.types = {'exact': 0.2, 'near': 0.5, 'far': 0.3} + + # Default modification levels + if self.modification_levels is None: + self.modification_levels = { + 'near': 0.1, # 10% modification + 'far': 0.3 # 30% modification + } + + # Validate configuration + assert sum(self.types.values() + ) == 1.0, 'Duplicate type probabilities must sum to 1.0' + assert 0 <= self.ratio <= 0.9, 'Dup ratio must be between 0 and 0.9' + + +class DuplicateGenerator: + """Main class for generating controlled duplicates""" + + def __init__(self, config: DuplicationConfig = None): + self.config = config or DuplicationConfig() + + # Initialize modification strategy + self.modification_strategy = ModificationStrategy() + + # Use the distributor class instead of inline distribution logic + self.distributor = DuplicateDistributor() + + # Map duplicate types to modifier methods + self.modifiers = { + 'exact': self._modify_exact, + 'near': self._modify_near, + 'far': self._modify_far + } + + def generate_from_dataset(self, + dataset_path: str, + output_path: str, + text_field: str = 'text', + id_field: str = 'id', + chunk_size: int = 100000, + num_processes: int = None) -> Dict[str, Any]: + """Generate duplicates from an existing dataset file""" + + # Determine number of processes + if num_processes is None: + num_processes = max(1, mp.cpu_count() - 1) + + # Count lines in file to determine total size + total_lines = sum(1 for _ in open(dataset_path, 'r')) + + # Calculate number of chunks + num_chunks = (total_lines + chunk_size - 1) // chunk_size + + # Process in chunks + results = [] + for chunk_idx in range(num_chunks): + print(f'Processing chunk {chunk_idx+1}/{num_chunks}') + + # Read chunk of documents + start_line = chunk_idx * chunk_size + end_line = min(start_line + chunk_size, total_lines) + + documents = [] + with open(dataset_path, 'r') as f: + for i, line in enumerate(f): + if i < start_line: + continue + if i >= end_line: + break + try: + doc = json.loads(line) + documents.append(doc) + except json.JSONDecodeError: + continue + + # Generate duplicates for this chunk + chunk_result = self._process_chunk( + documents, + output_path=f'{output_path}.chunk{chunk_idx}', + text_field=text_field, + id_field=id_field, + num_processes=num_processes) + + results.append(chunk_result) + + # Combine chunks if needed + if num_chunks > 1: + self._combine_chunks(output_path, num_chunks) + + # Aggregate statistics + stats = self._aggregate_stats(results) + + # Save statistics + with open(f'{output_path}.stats.json', 'w') as f: + json.dump(stats, f, indent=2) + + return stats + + def _process_chunk(self, documents: List[Dict[str, Any]], output_path: str, + text_field: str, id_field: str, + num_processes: int) -> Dict[str, Any]: + """Process a chunk of documents to generate duplicates""" + + # Calculate how many duplicates to create + num_originals = len(documents) + num_duplicates = int(num_originals * self.config.ratio / + (1 - self.config.ratio)) + + # Use the distributor class instead of inline distribution logic + if self.config.distribution == 'clustered': + 
distribution = self.distributor.clustered_distribution( + documents, + num_duplicates, + avg_cluster_size=self.config.cluster_size, + variance=self.config.cluster_variance) + elif self.config.distribution == 'power_law': + distribution = self.distributor.power_law_distribution( + documents, + num_duplicates, + exponent=2.0 # Could be configurable + ) + else: # "random" + distribution = self.distributor.random_distribution( + documents, num_duplicates) + + # Process distribution to create duplicates + duplicates = [] + + # Prepare arguments for parallel processing + args = [] + for original, count in distribution: + args.append((original, count, text_field, id_field)) + + # Process in parallel + with mp.Pool(num_processes) as pool: + duplicate_clusters = list( + tqdm(pool.imap(self._generate_cluster, args), + total=len(args), + desc='Generating duplicate clusters')) + + # Flatten clusters + for cluster in duplicate_clusters: + duplicates.extend(cluster) + + # Add metadata to original documents + for doc in documents: + if 'is_duplicate' not in doc: + doc['is_duplicate'] = False + doc['original_id'] = None + doc['duplicate_type'] = None + + # Combine and shuffle + all_documents = documents + duplicates + random.shuffle(all_documents) + + # Write to output file + with open(output_path, 'w') as f: + for doc in all_documents: + f.write(json.dumps(doc) + '\n') + + # Return statistics + return { + 'total_documents': len(all_documents), + 'original_documents': len(documents), + 'duplicate_documents': len(duplicates), + 'duplication_ratio': len(duplicates) / len(all_documents), + 'duplication_types': { + dup_type: sum(1 for d in duplicates + if d.get('duplicate_type') == dup_type) + for dup_type in self.config.types.keys() + } + } + + def _generate_cluster(self, args): + """Generate a cluster of duplicates from a single original""" + original, cluster_size, text_field, id_field = args + + duplicates = [] + for _ in range(cluster_size): + dup_type = self._select_duplicate_type() + duplicate = self._generate_duplicate( + (original, dup_type, text_field, id_field)) + duplicates.append(duplicate) + + return duplicates + + def _generate_duplicate(self, args): + """Generate a single duplicate from an original document""" + original, dup_type, text_field, id_field = args + + # Create base duplicate + duplicate = original.copy() + + # Add metadata + duplicate['is_duplicate'] = True + duplicate['original_id'] = original.get(id_field, 'unknown') + duplicate['duplicate_type'] = dup_type + + # Generate new ID + duplicate[id_field] = hashlib.md5( + (str(original.get(id_field, '')) + + str(random.random())).encode()).hexdigest() + + # Apply modifications based on duplicate type + if text_field in duplicate: + modifier_func = self.modifiers.get(dup_type, self._modify_exact) + duplicate[text_field] = modifier_func(duplicate[text_field]) + + return duplicate + + def _select_duplicate_type(self): + """Select a duplicate type based on configured probabilities""" + types, probs = zip(*self.config.types.items()) + return random.choices(types, weights=probs, k=1)[0] + + def _modify_exact(self, text): + """No modification for exact duplicates""" + return text + + def _modify_near(self, text): + """Apply near-duplicate modifications using ModificationStrategy""" + # Get modification intensity from config + intensity = self.config.modification_levels.get('near', 0.1) + + # Use the ModificationStrategy to apply various modifications + return self.modification_strategy.apply(text, intensity=intensity) + + def 
_modify_far(self, text): + """Apply far-duplicate modifications using ModificationStrategy""" + # Get modification intensity from config + intensity = self.config.modification_levels.get('far', 0.3) + + # For far duplicates, we might want to apply more aggressive m + # odifications + # First shuffle paragraphs + paragraphs = text.split('\n') + if len(paragraphs) > 1: + random.shuffle(paragraphs) + + # Then apply modifications to each paragraph + modified_paragraphs = [] + for paragraph in paragraphs: + # Apply the modification strategy with higher intensity + modified = self.modification_strategy.apply(paragraph, + intensity=intensity) + modified_paragraphs.append(modified) + + return '\n'.join(modified_paragraphs) + + def _get_similar_word(self, word): + """Generate a similar word (placeholder implementation)""" + # In a real implementation, you might use: + # - Word embeddings to find semantically similar words + # - Character-level modifications + # - Synonym lookup + + # Simple implementation: modify the word slightly + if len(word) <= 3: + return word + + if random.random() < 0.5: + # Change a character + pos = random.randint(0, len(word) - 1) + chars = list(word) + chars[pos] = random.choice('abcdefghijklmnopqrstuvwxyz') + return ''.join(chars) + else: + # Add or remove a character + if random.random() < 0.5 and len(word) > 3: + # Remove + pos = random.randint(0, len(word) - 1) + return word[:pos] + word[pos + 1:] + else: + # Add + pos = random.randint(0, len(word)) + char = random.choice('abcdefghijklmnopqrstuvwxyz') + return word[:pos] + char + word[pos:] + + def _combine_chunks(self, output_path, num_chunks): + """Combine chunk files into a single output file""" + with open(output_path, 'w') as outfile: + for chunk_idx in range(num_chunks): + chunk_path = f'{output_path}.chunk{chunk_idx}' + with open(chunk_path, 'r') as infile: + for line in infile: + outfile.write(line) + + # Remove chunk file + os.remove(chunk_path) + + def _aggregate_stats(self, chunk_stats): + """Aggregate statistics from multiple chunks""" + total_stats = { + 'total_documents': 0, + 'original_documents': 0, + 'duplicate_documents': 0, + 'duplication_types': + {dup_type: 0 + for dup_type in self.config.types.keys()} + } + + for stats in chunk_stats: + total_stats['total_documents'] += stats['total_documents'] + total_stats['original_documents'] += stats['original_documents'] + total_stats['duplicate_documents'] += stats['duplicate_documents'] + + for dup_type, count in stats['duplication_types'].items(): + total_stats['duplication_types'][dup_type] += count + + total_stats['duplication_ratio'] = ( + total_stats['duplicate_documents'] / total_stats['total_documents'] + if total_stats['total_documents'] > 0 else 0) + + return total_stats diff --git a/tools/dedupe_suite/dupe_gen/modifier.py b/tools/dedupe_suite/dupe_gen/modifier.py new file mode 100644 index 0000000000..4cdfea3618 --- /dev/null +++ b/tools/dedupe_suite/dupe_gen/modifier.py @@ -0,0 +1,180 @@ +import random +import re +import string +from typing import Callable, Dict + + +class TextModifier: + """Advanced text modification strategies for creating near-duplicates""" + + @staticmethod + def character_swap(text: str, rate: float = 0.05) -> str: + """Swap characters randomly""" + chars = list(text) + swaps = max(1, int(len(chars) * rate)) + + for _ in range(swaps): + if len(chars) < 2: + break + i = random.randint(0, len(chars) - 2) + chars[i], chars[i + 1] = chars[i + 1], chars[i] + + return ''.join(chars) + + @staticmethod + def 
word_replacement(text: str, rate: float = 0.1) -> str: + """Replace words with similar ones""" + words = text.split() + replacements = max(1, int(len(words) * rate)) + + for _ in range(replacements): + if not words: + break + i = random.randint(0, len(words) - 1) + + # Simple replacement strategy (could be enhanced with word + # embeddings) + if len(words[i]) > 3: + # Replace with a slightly modified version + chars = list(words[i]) + pos = random.randint(0, len(chars) - 1) + chars[pos] = random.choice(string.ascii_lowercase) + words[i] = ''.join(chars) + + return ' '.join(words) + + @staticmethod + def sentence_reordering(text: str, rate: float = 0.3) -> str: + """Reorder sentences within paragraphs""" + paragraphs = text.split('\n') + + for i, paragraph in enumerate(paragraphs): + # Split into sentences (simple approach) + sentences = re.split(r'(?<=[.!?])\s+', paragraph) + if len(sentences) > 1: + # Shuffle some sentences + num_to_shuffle = max(1, int(len(sentences) * rate)) + indices = random.sample(range(len(sentences)), + k=num_to_shuffle) + + # Extract sentences to shuffle + to_shuffle = [sentences[j] for j in sorted(indices)] + random.shuffle(to_shuffle) + + # Put back in original positions + for idx, j in enumerate(sorted(indices)): + sentences[j] = to_shuffle[idx] + + paragraphs[i] = ' '.join(sentences) + + return '\n'.join(paragraphs) + + @staticmethod + def html_modification(text: str, rate: float = 0.2) -> str: + """Modify HTML attributes while preserving structure""" + + # This is a simplified version - a real implementation would + # use an HTML parser + + # Modify attributes in tags + def replace_attr(match): + tag = match.group(1) + attrs = match.group(2) + + # Randomly modify some attributes + if random.random() < rate: + # Add a random attribute + attrs += f' data-random="{random.randint(1, 1000)}"' + + return f'<{tag} {attrs}>' + + modified = re.sub(r'<(\w+)\s+([^>]+)>', replace_attr, text) + return modified + + @staticmethod + def whitespace_modification(text: str) -> str: + """Modify whitespace without changing content""" + # Replace multiple spaces with single space + modified = re.sub(r'\s+', ' ', text) + + # Randomly add extra newlines + sentences = re.split(r'(?<=[.!?])\s+', modified) + for i in range(len(sentences) - 1): + if random.random() < 0.2: + sentences[i] = sentences[i] + '\n' + + return ' '.join(sentences) + + @staticmethod + def case_modification(text: str, rate: float = 0.1) -> str: + """Change case of some words""" + words = text.split() + modifications = max(1, int(len(words) * rate)) + + for _ in range(modifications): + if not words: + break + i = random.randint(0, len(words) - 1) + + # Skip very short words + if len(words[i]) < 3: + continue + + # Apply case modification + mod_type = random.choice(['upper', 'lower', 'title']) + if mod_type == 'upper': + words[i] = words[i].upper() + elif mod_type == 'lower': + words[i] = words[i].lower() + else: + words[i] = words[i].title() + + return ' '.join(words) + + +class ModificationStrategy: + """Combines multiple modification strategies with configurable weights""" + + def __init__(self, strategies: Dict[Callable, float] = None): + """Initialize with strategies and their weights + + Args: + strategies: Dictionary mapping strategy functions to their weights + """ + if strategies is None: + # Default strategies and weights + self.strategies = { + TextModifier.character_swap: 0.2, + TextModifier.word_replacement: 0.3, + TextModifier.sentence_reordering: 0.2, + TextModifier.whitespace_modification: 0.1, + 
TextModifier.case_modification: 0.2 + } + else: + self.strategies = strategies + + def apply(self, text: str, intensity: float = 0.5) -> str: + """Apply modification strategies based on weights and intensity + + Args: + text: Text to modify + intensity: Overall modification intensity (0.0 to 1.0) + + Returns: + Modified text + """ + modified = text + + # Normalize weights + total_weight = sum(self.strategies.values()) + normalized_weights = { + k: v / total_weight + for k, v in self.strategies.items() + } + + # Apply strategies based on weights and intensity + for strategy, weight in normalized_weights.items(): + if random.random() < weight * intensity: + modified = strategy(modified) + + return modified From fa4ac51a0e987f26f6be173c959f8c9ee69786f7 Mon Sep 17 00:00:00 2001 From: cyruszhang Date: Tue, 4 Mar 2025 13:48:04 -0800 Subject: [PATCH 3/3] remove dup files --- tests/ops/data/img1_dup.png | 1 - tests/ops/data/img2_dup.jpg | 1 - tests/ops/data/img3_dup.jpg | 1 - tests/ops/data/img3_dup_dup.jpg | 1 - tests/ops/data/video1_dup.mp4 | 1 - tests/ops/data/video2_dup.mp4 | 1 - tests/ops/data/video3_dup.mp4 | 1 - tests/ops/data/video3_dup_dup.mp4 | 1 - 8 files changed, 8 deletions(-) delete mode 120000 tests/ops/data/img1_dup.png delete mode 120000 tests/ops/data/img2_dup.jpg delete mode 120000 tests/ops/data/img3_dup.jpg delete mode 120000 tests/ops/data/img3_dup_dup.jpg delete mode 120000 tests/ops/data/video1_dup.mp4 delete mode 120000 tests/ops/data/video2_dup.mp4 delete mode 120000 tests/ops/data/video3_dup.mp4 delete mode 120000 tests/ops/data/video3_dup_dup.mp4 diff --git a/tests/ops/data/img1_dup.png b/tests/ops/data/img1_dup.png deleted file mode 120000 index d62a859006..0000000000 --- a/tests/ops/data/img1_dup.png +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img1.png \ No newline at end of file diff --git a/tests/ops/data/img2_dup.jpg b/tests/ops/data/img2_dup.jpg deleted file mode 120000 index 8a99a25260..0000000000 --- a/tests/ops/data/img2_dup.jpg +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img2.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup.jpg b/tests/ops/data/img3_dup.jpg deleted file mode 120000 index 6e8c435e31..0000000000 --- a/tests/ops/data/img3_dup.jpg +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img3.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup_dup.jpg b/tests/ops/data/img3_dup_dup.jpg deleted file mode 120000 index f539c0972a..0000000000 --- a/tests/ops/data/img3_dup_dup.jpg +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img3_dup.jpg \ No newline at end of file diff --git a/tests/ops/data/video1_dup.mp4 b/tests/ops/data/video1_dup.mp4 deleted file mode 120000 index 6d1bbbc84b..0000000000 --- a/tests/ops/data/video1_dup.mp4 +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video1.mp4 \ No newline at end of file diff --git a/tests/ops/data/video2_dup.mp4 b/tests/ops/data/video2_dup.mp4 deleted file mode 120000 index 8fa6335be5..0000000000 --- a/tests/ops/data/video2_dup.mp4 +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video2.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup.mp4 b/tests/ops/data/video3_dup.mp4 deleted file mode 120000 index f631588609..0000000000 --- a/tests/ops/data/video3_dup.mp4 +++ /dev/null @@ -1 +0,0 @@ 
-/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video3.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup_dup.mp4 b/tests/ops/data/video3_dup_dup.mp4 deleted file mode 120000 index 6a225ba396..0000000000 --- a/tests/ops/data/video3_dup_dup.mp4 +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video3_dup.mp4 \ No newline at end of file
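
A minimal usage sketch for the new dupe_gen module follows; it is not part of the patches above. It composes DuplicationConfig, DuplicateGenerator, and a custom ModificationStrategy, and exercises the 'power_law' distribution that the examples in app.py do not cover. The import paths assume the repository root is on PYTHONPATH and that tools/dedupe_suite/dupe_gen is importable as a package; the dataset and output paths are placeholders.

from tools.dedupe_suite.dupe_gen.generator import (DuplicateGenerator,
                                                   DuplicationConfig)
from tools.dedupe_suite.dupe_gen.modifier import (ModificationStrategy,
                                                  TextModifier)

if __name__ == '__main__':
    # Skewed duplication profile: a handful of documents collect most of the
    # duplicates, and most duplicates are lightly modified near-duplicates.
    config = DuplicationConfig(
        ratio=0.4,                 # 40% of the final corpus are duplicates
        distribution='power_law',  # 'random' and 'clustered' also supported
        types={'exact': 0.25, 'near': 0.5, 'far': 0.25},
        modification_levels={'near': 0.05, 'far': 0.25},
    )
    generator = DuplicateGenerator(config)

    # Optionally bias the text modifier toward word-level edits; weights are
    # normalized inside ModificationStrategy.apply().
    generator.modification_strategy = ModificationStrategy({
        TextModifier.word_replacement: 0.5,
        TextModifier.character_swap: 0.3,
        TextModifier.case_modification: 0.2,
    })

    # Placeholder paths; generate_from_dataset writes the duplicated corpus
    # plus an <output_path>.stats.json summary of the injected duplicates.
    stats = generator.generate_from_dataset(
        dataset_path='data/c4_sample.jsonl',
        output_path='c4_power_law_dups.jsonl')
    print(stats)

A natural follow-up is to run the c4-baseline.yaml recipe over the generated file and check how many of the injected near and far duplicates its document_simhash_deduplicator recovers.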