54 changes: 54 additions & 0 deletions tools/dedupe_suite/configs/c4-baseline.yaml
@@ -0,0 +1,54 @@
# based on the redpajama-c4-refine recipe, with more aggressive filtering and deduplication to produce a benchmark dataset

# global parameters
project_name: 'Data-Juicer-recipes-c4'
dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
export_path: '/path/to/your/dataset.jsonl' # path to your dataset result file

np: 50 # number of subprocesses to process your dataset
open_tracer: True

# process schedule
# a list of several process operators with their arguments
process:
- clean_email_mapper:
- clean_links_mapper:
- fix_unicode_mapper:
- punctuation_normalization_mapper:
- whitespace_normalization_mapper:

- alphanumeric_filter:
tokenization: false
min_ratio: 0.65 # <3sigma (0.740)
max_ratio: 0.9 # >3sigma (0.867)
- average_line_length_filter: # for code
max_len: 3000 # >3sigma (1277)
- character_repetition_filter:
rep_len: 10
max_ratio: 0.3 # >3sigma (0.168)
- language_id_score_filter:
min_score: 0.6
- maximum_line_length_filter: # for code
max_len: 4000 # >3sigma (2017)
- perplexity_filter:
lang: en
max_ppl: 6000 #(>3sigma 4543)
- special_characters_filter:
max_ratio: 0.4 # > 3sigma (0.303)
- words_num_filter:
tokenization: true
min_num: 20
max_num: 10000
- word_repetition_filter:
lang: en
tokenization: true
rep_len: 10
max_ratio: 0.231 # 3sigma

- document_simhash_deduplicator:
tokenization: space
window_size: 6
lowercase: true
ignore_pattern: '\p{P}'
num_blocks: 6
hamming_distance: 4
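
For orientation, the sketch below illustrates the idea behind the document_simhash_deduplicator settings above: documents are lowercased, punctuation is ignored, space-tokenized 6-gram shingles are hashed into a 64-bit fingerprint, and two documents are treated as near-duplicates when their fingerprints differ in at most hamming_distance bits. This is an illustration of the technique only, not Data-Juicer's implementation, and it omits the block index that makes candidate lookup efficient at scale.

```python
import hashlib
import re


def simhash(text: str, window_size: int = 6, bits: int = 64) -> int:
    """Compute a simple 64-bit simhash over space-tokenized n-gram shingles."""
    # Crude stand-in for ignore_pattern '\p{P}': drop punctuation, keep word characters.
    tokens = re.sub(r'[^\w\s]', ' ', text.lower()).split()
    weights = [0] * bits
    for i in range(max(1, len(tokens) - window_size + 1)):
        shingle = ' '.join(tokens[i:i + window_size])
        h = int(hashlib.md5(shingle.encode('utf-8')).hexdigest(), 16) & ((1 << bits) - 1)
        for b in range(bits):
            weights[b] += 1 if (h >> b) & 1 else -1
    return sum(1 << b for b in range(bits) if weights[b] > 0)


def is_near_duplicate(a: str, b: str, hamming_distance: int = 4) -> bool:
    """Near-duplicate if the fingerprints differ in at most hamming_distance bits."""
    return bin(simhash(a) ^ simhash(b)).count('1') <= hamming_distance
```

The num_blocks: 6 / hamming_distance: 4 pairing follows the usual simhash indexing argument: splitting a fingerprint into 6 blocks guarantees that any pair within Hamming distance 4 shares at least two identical blocks, so candidates can be found by block lookup instead of all-pairs comparison.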
91 changes: 91 additions & 0 deletions tools/dedupe_suite/dupe_gen/app.py
@@ -0,0 +1,91 @@
# Example usage
from .generator import DuplicateGenerator, DuplicationConfig


def generate_benchmarks(base_dataset_path='data/c4_sample.jsonl'):
"""Generate benchmark datasets with various duplication configurations

Args:
base_dataset_path: Path to the base dataset file (JSONL format)
"""
# 1. Basic usage with default settings
generator = DuplicateGenerator()
stats = generator.generate_from_dataset(
dataset_path=base_dataset_path,
output_path='output_with_duplicates.jsonl')

# 2. Clustered duplicates with specific ratio
# clustered_config = DuplicationConfig(ratio=0.3,
# distribution='clustered',
# cluster_size=5,
# types={
# 'exact': 0.2,
# 'near': 0.6,
# 'far': 0.2
# })

# clustered_generator = DuplicateGenerator(clustered_config)
# clustered_stats = clustered_generator.generate_from_dataset(
# dataset_path=base_dataset_path,
# output_path='output_clustered_duplicates.jsonl')

# 3. High duplication rate with mostly near-duplicates
# high_dup_config = DuplicationConfig(
# ratio=0.7,
# types={
# 'exact': 0.1,
# 'near': 0.8,
# 'far': 0.1
# },
# modification_levels={
# 'near': 0.05,
# 'far': 0.2
# } # Very subtle near-duplicates
# )

# high_dup_generator = DuplicateGenerator(high_dup_config)
# high_dup_stats = high_dup_generator.generate_from_dataset(
# dataset_path=base_dataset_path,
# output_path='output_high_duplication.jsonl')

# 4. Generate benchmarks of different sizes
for size_name, num_docs in [('small', 10000), ('medium', 100000),
('large', 1000000)]:
# Create sample of appropriate size
sample_path = f'sample_{size_name}.jsonl'
with open(sample_path, 'w') as outfile:
with open(base_dataset_path, 'r') as infile:
for i, line in enumerate(infile):
if i >= num_docs:
break
outfile.write(line)

# Generate duplicates with different configurations
for dup_rate in [0.1, 0.3, 0.5]:
for dist in ['random', 'clustered']:
config = DuplicationConfig(ratio=dup_rate, distribution=dist)
generator = DuplicateGenerator(config)
fn = f'output_{size_name}_{dist}_{int(dup_rate*100)}pct_dups.jsonl' # noqa
stats = generator.generate_from_dataset(
dataset_path=sample_path, output_path=fn)

print(f'Generated {size_name} benchmark '
f'with {dist} distribution '
f'and {dup_rate*100}% duplicates')
print(f'Stats: {stats}')


# If this file is run directly
if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(
description='Generate benchmark datasets with controlled duplication')
parser.add_argument('--dataset',
type=str,
default='data/c4_sample.jsonl',
help='Path to the base dataset file (JSONL format)')

args = parser.parse_args()

generate_benchmarks(args.dataset)
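
After generating a benchmark, a quick sanity check on the achieved duplication level can be helpful. The sketch below is not part of the suite and only counts exact duplicates by hashing a text field (the 'text' key is an assumption about the record schema), so the near and far duplicates produced above will not be counted.

```python
import hashlib
import json
from collections import Counter


def exact_dup_rate(path: str, text_key: str = 'text') -> float:
    """Fraction of records whose text is byte-identical to an earlier record."""
    counts = Counter()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            digest = hashlib.md5(record[text_key].encode('utf-8')).hexdigest()
            counts[digest] += 1
    total = sum(counts.values())
    return (total - len(counts)) / total if total else 0.0


print(exact_dup_rate('output_with_duplicates.jsonl'))
```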
135 changes: 135 additions & 0 deletions tools/dedupe_suite/dupe_gen/distributor.py
@@ -0,0 +1,135 @@
# dupe_gen/distributor.py
import random
from typing import Dict, List, Tuple

import numpy as np


class DuplicateDistributor:
"""Controls how duplicates are distributed in the dataset"""

@staticmethod
def random_distribution(originals: List[Dict],
num_duplicates: int) -> List[Tuple[Dict, int]]:
"""Distribute duplicates randomly across originals

Args:
originals: List of original documents
num_duplicates: Number of duplicates to create

Returns:
List of (original, count) tuples indicating how many duplicates
to create
"""
distribution = []

# Simple random selection with replacement
for _ in range(num_duplicates):
original = random.choice(originals)
distribution.append((original, 1))

return distribution

@staticmethod
def clustered_distribution(
originals: List[Dict],
num_duplicates: int,
avg_cluster_size: float = 5.0,
variance: float = 0.5) -> List[Tuple[Dict, int]]:
"""Distribute duplicates in clusters

Args:
originals: List of original documents
num_duplicates: Number of duplicates to create
avg_cluster_size: Average number of duplicates per original
variance: Variance in cluster sizes (0-1, higher means more
variance)

Returns:
List of (original, count) tuples indicating how many duplicates
to create
"""
# Determine how many originals to use
num_clusters = max(1, int(num_duplicates / avg_cluster_size))

# Select originals to duplicate
selected_originals = random.sample(originals,
k=min(num_clusters, len(originals)))

# Generate cluster sizes following a power law distribution
alpha = 2.0 # Power law exponent (adjust for different distributions)
sizes = np.random.power(
alpha, size=len(selected_originals)) * avg_cluster_size * 2
sizes = np.maximum(sizes, 1) # Ensure at least size 1

# Apply variance
if variance > 0:
# Add noise proportional to variance
noise = np.random.normal(0,
variance * avg_cluster_size,
size=len(sizes))
sizes = np.maximum(sizes + noise, 1)

# Convert to integers
sizes = sizes.astype(int)

# Adjust to match total required duplicates
total = sum(sizes)
if total > num_duplicates:
# Scale down
sizes = np.floor(sizes * (num_duplicates / total)).astype(int)
# Distribute remaining
remainder = num_duplicates - sum(sizes)
for i in range(remainder):
sizes[i % len(sizes)] += 1
elif total < num_duplicates:
# Scale up
deficit = num_duplicates - total
# Distribute deficit
for i in range(deficit):
sizes[i % len(sizes)] += 1

# Create distribution
distribution = [(original, int(size))
for original, size in zip(selected_originals, sizes)]

return distribution

@staticmethod
def power_law_distribution(
originals: List[Dict],
num_duplicates: int,
exponent: float = 2.0) -> List[Tuple[Dict, int]]:
"""Distribute duplicates following a power law (few originals get
many duplicates)

Args:
originals: List of original documents
num_duplicates: Number of duplicates to create
exponent: Power law exponent (higher means more skewed
distribution)

Returns:
List of (original, count) tuples indicating how many duplicates
to create
"""
# Select a subset of originals to duplicate
num_to_duplicate = min(len(originals), max(1,
int(len(originals) * 0.1)))
selected_originals = random.sample(originals, k=num_to_duplicate)

# Generate power law weights
weights = np.power(np.arange(1, num_to_duplicate + 1), -exponent)
weights = weights / np.sum(weights) # Normalize

# Distribute duplicates according to weights
counts = np.zeros(num_to_duplicate, dtype=int)
for _ in range(num_duplicates):
idx = np.random.choice(num_to_duplicate, p=weights)
counts[idx] += 1

# Create distribution
distribution = [(original, int(count))
for original, count in zip(selected_originals, counts)]

return distribution
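
A minimal usage sketch of DuplicateDistributor follows. The sample records and the import path (assuming the package is importable as tools.dedupe_suite.dupe_gen.distributor) are illustrative assumptions. All three strategies return (original, count) pairs whose counts sum to the requested number of duplicates; they differ in how many distinct originals are reused and how skewed the per-original counts are.

```python
from tools.dedupe_suite.dupe_gen.distributor import DuplicateDistributor  # assumed import path

# Hypothetical sample records standing in for real dataset documents.
docs = [{'id': i, 'text': f'document {i}'} for i in range(100)]

strategies = {
    'random': DuplicateDistributor.random_distribution(docs, 50),
    'clustered': DuplicateDistributor.clustered_distribution(docs, 50, avg_cluster_size=5.0),
    'power_law': DuplicateDistributor.power_law_distribution(docs, 50, exponent=2.0),
}

for name, dist in strategies.items():
    distinct = len({id(original) for original, _ in dist})
    total = sum(count for _, count in dist)
    print(f'{name}: {distinct} distinct originals reused, {total} duplicates in total')
```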