54 changes: 54 additions & 0 deletions tools/dedupe_suite/configs/c4-baseline.yaml
@@ -0,0 +1,54 @@
# based on the redpajama-c4-refine recipe, with more aggressive filtering and deduplication to produce a benchmark dataset

# global parameters
project_name: 'Data-Juicer-recipes-c4'
dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
export_path: '/path/to/your/dataset.jsonl' # path to your dataset result file

np: 50 # number of subprocesses to process your dataset
open_tracer: True

# process schedule
# a list of several process operators with their arguments
process:
- clean_email_mapper:
- clean_links_mapper:
- fix_unicode_mapper:
- punctuation_normalization_mapper:
- whitespace_normalization_mapper:

- alphanumeric_filter:
tokenization: false
min_ratio: 0.65 # <3sigma (0.740)
max_ratio: 0.9 # >3sigma (0.867)
- average_line_length_filter: # for code
max_len: 3000 # >3sigma (1277)
- character_repetition_filter:
rep_len: 10
max_ratio: 0.3 # >3sigma (0.168)
- language_id_score_filter:
min_score: 0.6
- maximum_line_length_filter: # for code
max_len: 4000 # >3sigma (2017)
- perplexity_filter:
lang: en
max_ppl: 6000 #(>3sigma 4543)
- special_characters_filter:
max_ratio: 0.4 # > 3sigma (0.303)
- words_num_filter:
tokenization: true
min_num: 20
max_num: 10000
- word_repetition_filter:
lang: en
tokenization: true
rep_len: 10
max_ratio: 0.231 # 3sigma

- document_simhash_deduplicator:
tokenization: space
window_size: 6
lowercase: true
ignore_pattern: '\p{P}'
num_blocks: 6
hamming_distance: 4
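
For orientation, the sketch below illustrates the idea behind the document_simhash_deduplicator settings above: documents are lowercased, punctuation is ignored, space-tokenized 6-gram shingles are hashed into a 64-bit fingerprint, and two documents are treated as near-duplicates when their fingerprints differ in at most hamming_distance bits. This is an illustration of the technique only, not Data-Juicer's implementation, and it omits the block index that makes candidate lookup efficient at scale.

```python
import hashlib
import re


def simhash(text: str, window_size: int = 6, bits: int = 64) -> int:
    """Compute a simple 64-bit simhash over space-tokenized n-gram shingles."""
    # Crude stand-in for ignore_pattern '\p{P}': drop punctuation, keep word characters.
    tokens = re.sub(r'[^\w\s]', ' ', text.lower()).split()
    weights = [0] * bits
    for i in range(max(1, len(tokens) - window_size + 1)):
        shingle = ' '.join(tokens[i:i + window_size])
        h = int(hashlib.md5(shingle.encode('utf-8')).hexdigest(), 16) & ((1 << bits) - 1)
        for b in range(bits):
            weights[b] += 1 if (h >> b) & 1 else -1
    return sum(1 << b for b in range(bits) if weights[b] > 0)


def is_near_duplicate(a: str, b: str, hamming_distance: int = 4) -> bool:
    """Near-duplicate if the fingerprints differ in at most hamming_distance bits."""
    return bin(simhash(a) ^ simhash(b)).count('1') <= hamming_distance
```

The num_blocks: 6 / hamming_distance: 4 pairing follows the usual simhash indexing argument: splitting a fingerprint into 6 blocks guarantees that any pair within Hamming distance 4 shares at least two identical blocks, so candidates can be found by block lookup instead of all-pairs comparison.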
91 changes: 91 additions & 0 deletions tools/dedupe_suite/dupe_gen/app.py
@@ -0,0 +1,91 @@
# Example usage
from .generator import DuplicateGenerator, DuplicationConfig


def generate_benchmarks(base_dataset_path='data/c4_sample.jsonl'):
"""Generate benchmark datasets with various duplication configurations

Args:
base_dataset_path: Path to the base dataset file (JSONL format)
"""
# 1. Basic usage with default settings
generator = DuplicateGenerator()
stats = generator.generate_from_dataset(
dataset_path=base_dataset_path,
output_path='output_with_duplicates.jsonl')

# 2. Clustered duplicates with specific ratio
# clustered_config = DuplicationConfig(ratio=0.3,
# distribution='clustered',
# cluster_size=5,
# types={
# 'exact': 0.2,
# 'near': 0.6,
# 'far': 0.2
# })

# clustered_generator = DuplicateGenerator(clustered_config)
# clustered_stats = clustered_generator.generate_from_dataset(
# dataset_path=base_dataset_path,
# output_path='output_clustered_duplicates.jsonl')

# 3. High duplication rate with mostly near-duplicates
# high_dup_config = DuplicationConfig(
# ratio=0.7,
# types={
# 'exact': 0.1,
# 'near': 0.8,
# 'far': 0.1
# },
# modification_levels={
# 'near': 0.05,
# 'far': 0.2
# } # Very subtle near-duplicates
# )

# high_dup_generator = DuplicateGenerator(high_dup_config)
# high_dup_stats = high_dup_generator.generate_from_dataset(
# dataset_path=base_dataset_path,
# output_path='output_high_duplication.jsonl')

# 4. Generate benchmarks of different sizes
for size_name, num_docs in [('small', 10000), ('medium', 100000),
('large', 1000000)]:
# Create sample of appropriate size
sample_path = f'sample_{size_name}.jsonl'
with open(sample_path, 'w') as outfile:
with open(base_dataset_path, 'r') as infile:
for i, line in enumerate(infile):
if i >= num_docs:
break
outfile.write(line)

# Generate duplicates with different configurations
for dup_rate in [0.1, 0.3, 0.5]:
for dist in ['random', 'clustered']:
config = DuplicationConfig(ratio=dup_rate, distribution=dist)
generator = DuplicateGenerator(config)
fn = f'output_{size_name}_{dist}_{int(dup_rate*100)}pct_dups.jsonl' # noqa
stats = generator.generate_from_dataset(
dataset_path=sample_path, output_path=fn)

print(f'Generated {size_name} benchmark '
f'with {dist} distribution '
f'and {dup_rate*100}% duplicates')
print(f'Stats: {stats}')


# If this file is run directly
if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(
description='Generate benchmark datasets with controlled duplication')
parser.add_argument('--dataset',
type=str,
default='data/c4_sample.jsonl',
help='Path to the base dataset file (JSONL format)')

args = parser.parse_args()

generate_benchmarks(args.dataset)
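
After generating a benchmark, a quick sanity check on the achieved duplication level can be helpful. The sketch below is not part of the suite and only counts exact duplicates by hashing a text field (the 'text' key is an assumption about the record schema), so the near and far duplicates produced above will not be counted.

```python
import hashlib
import json
from collections import Counter


def exact_dup_rate(path: str, text_key: str = 'text') -> float:
    """Fraction of records whose text is byte-identical to an earlier record."""
    counts = Counter()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            digest = hashlib.md5(record[text_key].encode('utf-8')).hexdigest()
            counts[digest] += 1
    total = sum(counts.values())
    return (total - len(counts)) / total if total else 0.0


print(exact_dup_rate('output_with_duplicates.jsonl'))
```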
135 changes: 135 additions & 0 deletions tools/dedupe_suite/dupe_gen/distributor.py
@@ -0,0 +1,135 @@
# dupe_gen/distributor.py
import random
from typing import Dict, List, Tuple

import numpy as np


class DuplicateDistributor:
"""Controls how duplicates are distributed in the dataset"""

@staticmethod
def random_distribution(originals: List[Dict],
num_duplicates: int) -> List[Tuple[Dict, int]]:
"""Distribute duplicates randomly across originals

Args:
originals: List of original documents
num_duplicates: Number of duplicates to create

Returns:
List of (original, count) tuples indicating how many duplicates
to create
"""
distribution = []

# Simple random selection with replacement
for _ in range(num_duplicates):
original = random.choice(originals)
distribution.append((original, 1))

return distribution

@staticmethod
def clustered_distribution(
originals: List[Dict],
num_duplicates: int,
avg_cluster_size: float = 5.0,
variance: float = 0.5) -> List[Tuple[Dict, int]]:
"""Distribute duplicates in clusters

Args:
originals: List of original documents
num_duplicates: Number of duplicates to create
avg_cluster_size: Average number of duplicates per original
variance: Variance in cluster sizes (0-1, higher means more
variance)

Returns:
List of (original, count) tuples indicating how many duplicates
to create
"""
# Determine how many originals to use
num_clusters = max(1, int(num_duplicates / avg_cluster_size))

# Select originals to duplicate
selected_originals = random.sample(originals,
k=min(num_clusters, len(originals)))

# Generate cluster sizes following a power law distribution
alpha = 2.0 # Power law exponent (adjust for different distributions)
sizes = np.random.power(
alpha, size=len(selected_originals)) * avg_cluster_size * 2
sizes = np.maximum(sizes, 1) # Ensure at least size 1

# Apply variance
if variance > 0:
# Add noise proportional to variance
noise = np.random.normal(0,
variance * avg_cluster_size,
size=len(sizes))
sizes = np.maximum(sizes + noise, 1)

# Convert to integers
sizes = sizes.astype(int)

# Adjust to match total required duplicates
total = sum(sizes)
if total > num_duplicates:
# Scale down
sizes = np.floor(sizes * (num_duplicates / total)).astype(int)
# Distribute remaining
remainder = num_duplicates - sum(sizes)
for i in range(remainder):
sizes[i % len(sizes)] += 1
elif total < num_duplicates:
# Scale up
deficit = num_duplicates - total
# Distribute deficit
for i in range(deficit):
sizes[i % len(sizes)] += 1

# Create distribution
distribution = [(original, int(size))
for original, size in zip(selected_originals, sizes)]

return distribution

@staticmethod
def power_law_distribution(
originals: List[Dict],
num_duplicates: int,
exponent: float = 2.0) -> List[Tuple[Dict, int]]:
"""Distribute duplicates following a power law (few originals get
many duplicates)

Args:
originals: List of original documents
num_duplicates: Number of duplicates to create
exponent: Power law exponent (higher means more skewed
distribution)

Returns:
List of (original, count) tuples indicating how many duplicates
to create
"""
# Select a subset of originals to duplicate
num_to_duplicate = min(len(originals), max(1,
int(len(originals) * 0.1)))
selected_originals = random.sample(originals, k=num_to_duplicate)

# Generate power law weights
weights = np.power(np.arange(1, num_to_duplicate + 1), -exponent)
weights = weights / np.sum(weights) # Normalize

# Distribute duplicates according to weights
counts = np.zeros(num_to_duplicate, dtype=int)
for _ in range(num_duplicates):
idx = np.random.choice(num_to_duplicate, p=weights)
counts[idx] += 1

# Create distribution
distribution = [(original, int(count))
for original, count in zip(selected_originals, counts)]

return distribution
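
A minimal usage sketch of DuplicateDistributor follows. The sample records and the import path (assuming the package is importable as tools.dedupe_suite.dupe_gen.distributor) are illustrative assumptions. All three strategies return (original, count) pairs whose counts sum to the requested number of duplicates; they differ in how many distinct originals are reused and how skewed the per-original counts are.

```python
from tools.dedupe_suite.dupe_gen.distributor import DuplicateDistributor  # assumed import path

# Hypothetical sample records standing in for real dataset documents.
docs = [{'id': i, 'text': f'document {i}'} for i in range(100)]

strategies = {
    'random': DuplicateDistributor.random_distribution(docs, 50),
    'clustered': DuplicateDistributor.clustered_distribution(docs, 50, avg_cluster_size=5.0),
    'power_law': DuplicateDistributor.power_law_distribution(docs, 50, exponent=2.0),
}

for name, dist in strategies.items():
    distinct = len({id(original) for original, _ in dist})
    total = sum(count for _, count in dist)
    print(f'{name}: {distinct} distinct originals reused, {total} duplicates in total')
```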