From 55fa9a969d956abd604c6036fb604472dd36212d Mon Sep 17 00:00:00 2001 From: cyruszhang Date: Thu, 27 Feb 2025 10:11:57 -0800 Subject: [PATCH 1/3] add initial config for c4 refine --- tools/dedupe_suite/configs/c4-benchmark.yaml | 54 ++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tools/dedupe_suite/configs/c4-benchmark.yaml diff --git a/tools/dedupe_suite/configs/c4-benchmark.yaml b/tools/dedupe_suite/configs/c4-benchmark.yaml new file mode 100644 index 0000000000..4fc482adae --- /dev/null +++ b/tools/dedupe_suite/configs/c4-benchmark.yaml @@ -0,0 +1,54 @@ +# redpajama-c4-refine with more aggressive filtering and deduplication to produce benchmark dataset + +# global parameters +project_name: 'Data-Juicer-recipes-c4' +dataset_path: '/path/to/your/dataset' # path to your dataset directory or file +export_path: '/path/to/your/dataset.jsonl' # path to your dataset result file + +np: 50 # number of subprocess to process your dataset +open_tracer: True + +# process schedule +# a list of several process operators with their arguments +process: + - clean_email_mapper: + - clean_links_mapper: + - fix_unicode_mapper: + - punctuation_normalization_mapper: + - whitespace_normalization_mapper: + + - alphanumeric_filter: + tokenization: false + min_ratio: 0.65 # <3sigma (0.740) + max_ratio: 0.9 # >3sigma (0.867) + - average_line_length_filter: # for code + max_len: 3000 # >3sigma (1277) + - character_repetition_filter: + rep_len: 10 + max_ratio: 0.3 # >3sigma (0.168) + - language_id_score_filter: + min_score: 0.6 + - maximum_line_length_filter: # for code + max_len: 4000 # >3sigma (2017) + - perplexity_filter: + lang: en + max_ppl: 6000 #(>3sigma 4543) + - special_characters_filter: + max_ratio: 0.4 # > 3sigma (0.303) + - words_num_filter: + tokenization: true + min_num: 20 + max_num: 10000 + - word_repetition_filter: + lang: en + tokenization: true + rep_len: 10 + max_ratio: 0.231 # 3sigma + + - document_simhash_deduplicator: + tokenization: space + window_size: 6 + lowercase: true + ignore_pattern: '\p{P}' + num_blocks: 6 + hamming_distance: 4 From 6f8bdede7af3b43af64733838c58ad036d3fd55c Mon Sep 17 00:00:00 2001 From: cyruszhang Date: Tue, 4 Mar 2025 13:44:27 -0800 Subject: [PATCH 2/3] add dupe gen version 0.1 --- tests/ops/data/img1_dup.png | 1 + tests/ops/data/img2_dup.jpg | 1 + tests/ops/data/img3_dup.jpg | 1 + tests/ops/data/img3_dup_dup.jpg | 1 + tests/ops/data/video1_dup.mp4 | 1 + tests/ops/data/video2_dup.mp4 | 1 + tests/ops/data/video3_dup.mp4 | 1 + tests/ops/data/video3_dup_dup.mp4 | 1 + .../{c4-benchmark.yaml => c4-baseline.yaml} | 0 tools/dedupe_suite/dupe_gen/app.py | 91 +++++ tools/dedupe_suite/dupe_gen/distributor.py | 135 +++++++ tools/dedupe_suite/dupe_gen/generator.py | 345 ++++++++++++++++++ tools/dedupe_suite/dupe_gen/modifier.py | 180 +++++++++ 13 files changed, 759 insertions(+) create mode 120000 tests/ops/data/img1_dup.png create mode 120000 tests/ops/data/img2_dup.jpg create mode 120000 tests/ops/data/img3_dup.jpg create mode 120000 tests/ops/data/img3_dup_dup.jpg create mode 120000 tests/ops/data/video1_dup.mp4 create mode 120000 tests/ops/data/video2_dup.mp4 create mode 120000 tests/ops/data/video3_dup.mp4 create mode 120000 tests/ops/data/video3_dup_dup.mp4 rename tools/dedupe_suite/configs/{c4-benchmark.yaml => c4-baseline.yaml} (100%) create mode 100644 tools/dedupe_suite/dupe_gen/app.py create mode 100644 tools/dedupe_suite/dupe_gen/distributor.py create mode 100644 tools/dedupe_suite/dupe_gen/generator.py create mode 100644 
tools/dedupe_suite/dupe_gen/modifier.py diff --git a/tests/ops/data/img1_dup.png b/tests/ops/data/img1_dup.png new file mode 120000 index 0000000000..d62a859006 --- /dev/null +++ b/tests/ops/data/img1_dup.png @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img1.png \ No newline at end of file diff --git a/tests/ops/data/img2_dup.jpg b/tests/ops/data/img2_dup.jpg new file mode 120000 index 0000000000..8a99a25260 --- /dev/null +++ b/tests/ops/data/img2_dup.jpg @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img2.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup.jpg b/tests/ops/data/img3_dup.jpg new file mode 120000 index 0000000000..6e8c435e31 --- /dev/null +++ b/tests/ops/data/img3_dup.jpg @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img3.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup_dup.jpg b/tests/ops/data/img3_dup_dup.jpg new file mode 120000 index 0000000000..f539c0972a --- /dev/null +++ b/tests/ops/data/img3_dup_dup.jpg @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img3_dup.jpg \ No newline at end of file diff --git a/tests/ops/data/video1_dup.mp4 b/tests/ops/data/video1_dup.mp4 new file mode 120000 index 0000000000..6d1bbbc84b --- /dev/null +++ b/tests/ops/data/video1_dup.mp4 @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video1.mp4 \ No newline at end of file diff --git a/tests/ops/data/video2_dup.mp4 b/tests/ops/data/video2_dup.mp4 new file mode 120000 index 0000000000..8fa6335be5 --- /dev/null +++ b/tests/ops/data/video2_dup.mp4 @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video2.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup.mp4 b/tests/ops/data/video3_dup.mp4 new file mode 120000 index 0000000000..f631588609 --- /dev/null +++ b/tests/ops/data/video3_dup.mp4 @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video3.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup_dup.mp4 b/tests/ops/data/video3_dup_dup.mp4 new file mode 120000 index 0000000000..6a225ba396 --- /dev/null +++ b/tests/ops/data/video3_dup_dup.mp4 @@ -0,0 +1 @@ +/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video3_dup.mp4 \ No newline at end of file diff --git a/tools/dedupe_suite/configs/c4-benchmark.yaml b/tools/dedupe_suite/configs/c4-baseline.yaml similarity index 100% rename from tools/dedupe_suite/configs/c4-benchmark.yaml rename to tools/dedupe_suite/configs/c4-baseline.yaml diff --git a/tools/dedupe_suite/dupe_gen/app.py b/tools/dedupe_suite/dupe_gen/app.py new file mode 100644 index 0000000000..f5de54e5d9 --- /dev/null +++ b/tools/dedupe_suite/dupe_gen/app.py @@ -0,0 +1,91 @@ +# Example usage +from .generator import DuplicateGenerator, DuplicationConfig + + +def generate_benchmarks(base_dataset_path='data/c4_sample.jsonl'): + """Generate benchmark datasets with various duplication configurations + + Args: + base_dataset_path: Path to the base dataset file (JSONL format) + """ + # 1. Basic usage with default settings + generator = DuplicateGenerator() + stats = generator.generate_from_dataset( + dataset_path=base_dataset_path, + output_path='output_with_duplicates.jsonl') + + # 2. 
Clustered duplicates with specific ratio + # clustered_config = DuplicationConfig(ratio=0.3, + # distribution='clustered', + # cluster_size=5, + # types={ + # 'exact': 0.2, + # 'near': 0.6, + # 'far': 0.2 + # }) + + # clustered_generator = DuplicateGenerator(clustered_config) + # clustered_stats = clustered_generator.generate_from_dataset( + # dataset_path=base_dataset_path, + # output_path='output_clustered_duplicates.jsonl') + + # 3. High duplication rate with mostly near-duplicates + # high_dup_config = DuplicationConfig( + # ratio=0.7, + # types={ + # 'exact': 0.1, + # 'near': 0.8, + # 'far': 0.1 + # }, + # modification_levels={ + # 'near': 0.05, + # 'far': 0.2 + # } # Very subtle near-duplicates + # ) + + # high_dup_generator = DuplicateGenerator(high_dup_config) + # high_dup_stats = high_dup_generator.generate_from_dataset( + # dataset_path=base_dataset_path, + # output_path='output_high_duplication.jsonl') + + # 4. Generate benchmarks of different sizes + for size_name, num_docs in [('small', 10000), ('medium', 100000), + ('large', 1000000)]: + # Create sample of appropriate size + sample_path = f'sample_{size_name}.jsonl' + with open(sample_path, 'w') as outfile: + with open(base_dataset_path, 'r') as infile: + for i, line in enumerate(infile): + if i >= num_docs: + break + outfile.write(line) + + # Generate duplicates with different configurations + for dup_rate in [0.1, 0.3, 0.5]: + for dist in ['random', 'clustered']: + config = DuplicationConfig(ratio=dup_rate, distribution=dist) + generator = DuplicateGenerator(config) + fn = f'output_{size_name}_{dist}_{int(dup_rate*100)}pct_dups.jsonl' # noqa + stats = generator.generate_from_dataset( + dataset_path=sample_path, output_path=fn) + + print(f'Generated {size_name} benchmark ' + f'with {dist} distribution ' + f'and {dup_rate*100}% duplicates') + print(f'Stats: {stats}') + + +# If this file is run directly +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser( + description='Generate benchmark datasets with controlled duplication') + parser.add_argument('--dataset', + type=str, + default='data/c4_sample.jsonl', + help='Path to the base dataset file (JSONL format)') + + args = parser.parse_args() + + generate_benchmarks(args.dataset) diff --git a/tools/dedupe_suite/dupe_gen/distributor.py b/tools/dedupe_suite/dupe_gen/distributor.py new file mode 100644 index 0000000000..ea37abc2c7 --- /dev/null +++ b/tools/dedupe_suite/dupe_gen/distributor.py @@ -0,0 +1,135 @@ +# dupgen/distributors.py +import random +from typing import Dict, List, Tuple + +import numpy as np + + +class DuplicateDistributor: + """Controls how duplicates are distributed in the dataset""" + + @staticmethod + def random_distribution(originals: List[Dict], + num_duplicates: int) -> List[Tuple[Dict, int]]: + """Distribute duplicates randomly across originals + + Args: + originals: List of original documents + num_duplicates: Number of duplicates to create + + Returns: + List of (original, count) tuples indicating how many duplicates + to create + """ + distribution = [] + + # Simple random selection with replacement + for _ in range(num_duplicates): + original = random.choice(originals) + distribution.append((original, 1)) + + return distribution + + @staticmethod + def clustered_distribution( + originals: List[Dict], + num_duplicates: int, + avg_cluster_size: float = 5.0, + variance: float = 0.5) -> List[Tuple[Dict, int]]: + """Distribute duplicates in clusters + + Args: + originals: List of original documents + num_duplicates: Number of 
duplicates to create + avg_cluster_size: Average number of duplicates per original + variance: Variance in cluster sizes (0-1, higher means more + variance) + + Returns: + List of (original, count) tuples indicating how many duplicates + to create + """ + # Determine how many originals to use + num_clusters = max(1, int(num_duplicates / avg_cluster_size)) + + # Select originals to duplicate + selected_originals = random.sample(originals, + k=min(num_clusters, len(originals))) + + # Generate cluster sizes following a power law distribution + alpha = 2.0 # Power law exponent (adjust for different distributions) + sizes = np.random.power( + alpha, size=len(selected_originals)) * avg_cluster_size * 2 + sizes = np.maximum(sizes, 1) # Ensure at least size 1 + + # Apply variance + if variance > 0: + # Add noise proportional to variance + noise = np.random.normal(0, + variance * avg_cluster_size, + size=len(sizes)) + sizes = np.maximum(sizes + noise, 1) + + # Convert to integers + sizes = sizes.astype(int) + + # Adjust to match total required duplicates + total = sum(sizes) + if total > num_duplicates: + # Scale down + sizes = np.floor(sizes * (num_duplicates / total)).astype(int) + # Distribute remaining + remainder = num_duplicates - sum(sizes) + for i in range(remainder): + sizes[i % len(sizes)] += 1 + elif total < num_duplicates: + # Scale up + deficit = num_duplicates - total + # Distribute deficit + for i in range(deficit): + sizes[i % len(sizes)] += 1 + + # Create distribution + distribution = [(original, int(size)) + for original, size in zip(selected_originals, sizes)] + + return distribution + + @staticmethod + def power_law_distribution( + originals: List[Dict], + num_duplicates: int, + exponent: float = 2.0) -> List[Tuple[Dict, int]]: + """Distribute duplicates following a power law (few originals get + many duplicates) + + Args: + originals: List of original documents + num_duplicates: Number of duplicates to create + exponent: Power law exponent (higher means more skewed + distribution) + + Returns: + List of (original, count) tuples indicating how many duplicates + to create + """ + # Select a subset of originals to duplicate + num_to_duplicate = min(len(originals), max(1, + int(len(originals) * 0.1))) + selected_originals = random.sample(originals, k=num_to_duplicate) + + # Generate power law weights + weights = np.power(np.arange(1, num_to_duplicate + 1), -exponent) + weights = weights / np.sum(weights) # Normalize + + # Distribute duplicates according to weights + counts = np.zeros(num_to_duplicate, dtype=int) + for _ in range(num_duplicates): + idx = np.random.choice(num_to_duplicate, p=weights) + counts[idx] += 1 + + # Create distribution + distribution = [(original, int(count)) + for original, count in zip(selected_originals, counts)] + + return distribution diff --git a/tools/dedupe_suite/dupe_gen/generator.py b/tools/dedupe_suite/dupe_gen/generator.py new file mode 100644 index 0000000000..6d7ec4763f --- /dev/null +++ b/tools/dedupe_suite/dupe_gen/generator.py @@ -0,0 +1,345 @@ +import hashlib +import json +import multiprocessing as mp +import os +import random +from dataclasses import dataclass +from typing import Any, Dict, List + +from tqdm import tqdm + +from .distributor import DuplicateDistributor +from .modifier import ModificationStrategy + + +@dataclass +class DuplicationConfig: + """Configuration for duplication generation""" + ratio: float = 0.3 # Percentage of duplicates in final dataset + types: Dict[ + str, + float] = None # Distribution of duplicate types 
(exact, near, far) + distribution: str = 'random' # How duplicates are distributed + cluster_size: int = 5 # Average size of duplicate clusters + cluster_variance: float = 0.5 # Variance in cluster sizes + modification_levels: Dict[str, + float] = None # How much to modify for each type + cross_source: bool = False # Whether to create duplicates across sources + + def __post_init__(self): + # Default duplicate type distribution + if self.types is None: + self.types = {'exact': 0.2, 'near': 0.5, 'far': 0.3} + + # Default modification levels + if self.modification_levels is None: + self.modification_levels = { + 'near': 0.1, # 10% modification + 'far': 0.3 # 30% modification + } + + # Validate configuration + assert sum(self.types.values() + ) == 1.0, 'Duplicate type probabilities must sum to 1.0' + assert 0 <= self.ratio <= 0.9, 'Dup ratio must be between 0 and 0.9' + + +class DuplicateGenerator: + """Main class for generating controlled duplicates""" + + def __init__(self, config: DuplicationConfig = None): + self.config = config or DuplicationConfig() + + # Initialize modification strategy + self.modification_strategy = ModificationStrategy() + + # Use the distributor class instead of inline distribution logic + self.distributor = DuplicateDistributor() + + # Map duplicate types to modifier methods + self.modifiers = { + 'exact': self._modify_exact, + 'near': self._modify_near, + 'far': self._modify_far + } + + def generate_from_dataset(self, + dataset_path: str, + output_path: str, + text_field: str = 'text', + id_field: str = 'id', + chunk_size: int = 100000, + num_processes: int = None) -> Dict[str, Any]: + """Generate duplicates from an existing dataset file""" + + # Determine number of processes + if num_processes is None: + num_processes = max(1, mp.cpu_count() - 1) + + # Count lines in file to determine total size + total_lines = sum(1 for _ in open(dataset_path, 'r')) + + # Calculate number of chunks + num_chunks = (total_lines + chunk_size - 1) // chunk_size + + # Process in chunks + results = [] + for chunk_idx in range(num_chunks): + print(f'Processing chunk {chunk_idx+1}/{num_chunks}') + + # Read chunk of documents + start_line = chunk_idx * chunk_size + end_line = min(start_line + chunk_size, total_lines) + + documents = [] + with open(dataset_path, 'r') as f: + for i, line in enumerate(f): + if i < start_line: + continue + if i >= end_line: + break + try: + doc = json.loads(line) + documents.append(doc) + except json.JSONDecodeError: + continue + + # Generate duplicates for this chunk + chunk_result = self._process_chunk( + documents, + output_path=f'{output_path}.chunk{chunk_idx}', + text_field=text_field, + id_field=id_field, + num_processes=num_processes) + + results.append(chunk_result) + + # Combine chunks if needed + if num_chunks > 1: + self._combine_chunks(output_path, num_chunks) + + # Aggregate statistics + stats = self._aggregate_stats(results) + + # Save statistics + with open(f'{output_path}.stats.json', 'w') as f: + json.dump(stats, f, indent=2) + + return stats + + def _process_chunk(self, documents: List[Dict[str, Any]], output_path: str, + text_field: str, id_field: str, + num_processes: int) -> Dict[str, Any]: + """Process a chunk of documents to generate duplicates""" + + # Calculate how many duplicates to create + num_originals = len(documents) + num_duplicates = int(num_originals * self.config.ratio / + (1 - self.config.ratio)) + + # Use the distributor class instead of inline distribution logic + if self.config.distribution == 'clustered': + 
distribution = self.distributor.clustered_distribution( + documents, + num_duplicates, + avg_cluster_size=self.config.cluster_size, + variance=self.config.cluster_variance) + elif self.config.distribution == 'power_law': + distribution = self.distributor.power_law_distribution( + documents, + num_duplicates, + exponent=2.0 # Could be configurable + ) + else: # "random" + distribution = self.distributor.random_distribution( + documents, num_duplicates) + + # Process distribution to create duplicates + duplicates = [] + + # Prepare arguments for parallel processing + args = [] + for original, count in distribution: + args.append((original, count, text_field, id_field)) + + # Process in parallel + with mp.Pool(num_processes) as pool: + duplicate_clusters = list( + tqdm(pool.imap(self._generate_cluster, args), + total=len(args), + desc='Generating duplicate clusters')) + + # Flatten clusters + for cluster in duplicate_clusters: + duplicates.extend(cluster) + + # Add metadata to original documents + for doc in documents: + if 'is_duplicate' not in doc: + doc['is_duplicate'] = False + doc['original_id'] = None + doc['duplicate_type'] = None + + # Combine and shuffle + all_documents = documents + duplicates + random.shuffle(all_documents) + + # Write to output file + with open(output_path, 'w') as f: + for doc in all_documents: + f.write(json.dumps(doc) + '\n') + + # Return statistics + return { + 'total_documents': len(all_documents), + 'original_documents': len(documents), + 'duplicate_documents': len(duplicates), + 'duplication_ratio': len(duplicates) / len(all_documents), + 'duplication_types': { + dup_type: sum(1 for d in duplicates + if d.get('duplicate_type') == dup_type) + for dup_type in self.config.types.keys() + } + } + + def _generate_cluster(self, args): + """Generate a cluster of duplicates from a single original""" + original, cluster_size, text_field, id_field = args + + duplicates = [] + for _ in range(cluster_size): + dup_type = self._select_duplicate_type() + duplicate = self._generate_duplicate( + (original, dup_type, text_field, id_field)) + duplicates.append(duplicate) + + return duplicates + + def _generate_duplicate(self, args): + """Generate a single duplicate from an original document""" + original, dup_type, text_field, id_field = args + + # Create base duplicate + duplicate = original.copy() + + # Add metadata + duplicate['is_duplicate'] = True + duplicate['original_id'] = original.get(id_field, 'unknown') + duplicate['duplicate_type'] = dup_type + + # Generate new ID + duplicate[id_field] = hashlib.md5( + (str(original.get(id_field, '')) + + str(random.random())).encode()).hexdigest() + + # Apply modifications based on duplicate type + if text_field in duplicate: + modifier_func = self.modifiers.get(dup_type, self._modify_exact) + duplicate[text_field] = modifier_func(duplicate[text_field]) + + return duplicate + + def _select_duplicate_type(self): + """Select a duplicate type based on configured probabilities""" + types, probs = zip(*self.config.types.items()) + return random.choices(types, weights=probs, k=1)[0] + + def _modify_exact(self, text): + """No modification for exact duplicates""" + return text + + def _modify_near(self, text): + """Apply near-duplicate modifications using ModificationStrategy""" + # Get modification intensity from config + intensity = self.config.modification_levels.get('near', 0.1) + + # Use the ModificationStrategy to apply various modifications + return self.modification_strategy.apply(text, intensity=intensity) + + def 
_modify_far(self, text): + """Apply far-duplicate modifications using ModificationStrategy""" + # Get modification intensity from config + intensity = self.config.modification_levels.get('far', 0.3) + + # For far duplicates, we might want to apply more aggressive m + # odifications + # First shuffle paragraphs + paragraphs = text.split('\n') + if len(paragraphs) > 1: + random.shuffle(paragraphs) + + # Then apply modifications to each paragraph + modified_paragraphs = [] + for paragraph in paragraphs: + # Apply the modification strategy with higher intensity + modified = self.modification_strategy.apply(paragraph, + intensity=intensity) + modified_paragraphs.append(modified) + + return '\n'.join(modified_paragraphs) + + def _get_similar_word(self, word): + """Generate a similar word (placeholder implementation)""" + # In a real implementation, you might use: + # - Word embeddings to find semantically similar words + # - Character-level modifications + # - Synonym lookup + + # Simple implementation: modify the word slightly + if len(word) <= 3: + return word + + if random.random() < 0.5: + # Change a character + pos = random.randint(0, len(word) - 1) + chars = list(word) + chars[pos] = random.choice('abcdefghijklmnopqrstuvwxyz') + return ''.join(chars) + else: + # Add or remove a character + if random.random() < 0.5 and len(word) > 3: + # Remove + pos = random.randint(0, len(word) - 1) + return word[:pos] + word[pos + 1:] + else: + # Add + pos = random.randint(0, len(word)) + char = random.choice('abcdefghijklmnopqrstuvwxyz') + return word[:pos] + char + word[pos:] + + def _combine_chunks(self, output_path, num_chunks): + """Combine chunk files into a single output file""" + with open(output_path, 'w') as outfile: + for chunk_idx in range(num_chunks): + chunk_path = f'{output_path}.chunk{chunk_idx}' + with open(chunk_path, 'r') as infile: + for line in infile: + outfile.write(line) + + # Remove chunk file + os.remove(chunk_path) + + def _aggregate_stats(self, chunk_stats): + """Aggregate statistics from multiple chunks""" + total_stats = { + 'total_documents': 0, + 'original_documents': 0, + 'duplicate_documents': 0, + 'duplication_types': + {dup_type: 0 + for dup_type in self.config.types.keys()} + } + + for stats in chunk_stats: + total_stats['total_documents'] += stats['total_documents'] + total_stats['original_documents'] += stats['original_documents'] + total_stats['duplicate_documents'] += stats['duplicate_documents'] + + for dup_type, count in stats['duplication_types'].items(): + total_stats['duplication_types'][dup_type] += count + + total_stats['duplication_ratio'] = ( + total_stats['duplicate_documents'] / total_stats['total_documents'] + if total_stats['total_documents'] > 0 else 0) + + return total_stats diff --git a/tools/dedupe_suite/dupe_gen/modifier.py b/tools/dedupe_suite/dupe_gen/modifier.py new file mode 100644 index 0000000000..4cdfea3618 --- /dev/null +++ b/tools/dedupe_suite/dupe_gen/modifier.py @@ -0,0 +1,180 @@ +import random +import re +import string +from typing import Callable, Dict + + +class TextModifier: + """Advanced text modification strategies for creating near-duplicates""" + + @staticmethod + def character_swap(text: str, rate: float = 0.05) -> str: + """Swap characters randomly""" + chars = list(text) + swaps = max(1, int(len(chars) * rate)) + + for _ in range(swaps): + if len(chars) < 2: + break + i = random.randint(0, len(chars) - 2) + chars[i], chars[i + 1] = chars[i + 1], chars[i] + + return ''.join(chars) + + @staticmethod + def 
word_replacement(text: str, rate: float = 0.1) -> str: + """Replace words with similar ones""" + words = text.split() + replacements = max(1, int(len(words) * rate)) + + for _ in range(replacements): + if not words: + break + i = random.randint(0, len(words) - 1) + + # Simple replacement strategy (could be enhanced with word + # embeddings) + if len(words[i]) > 3: + # Replace with a slightly modified version + chars = list(words[i]) + pos = random.randint(0, len(chars) - 1) + chars[pos] = random.choice(string.ascii_lowercase) + words[i] = ''.join(chars) + + return ' '.join(words) + + @staticmethod + def sentence_reordering(text: str, rate: float = 0.3) -> str: + """Reorder sentences within paragraphs""" + paragraphs = text.split('\n') + + for i, paragraph in enumerate(paragraphs): + # Split into sentences (simple approach) + sentences = re.split(r'(?<=[.!?])\s+', paragraph) + if len(sentences) > 1: + # Shuffle some sentences + num_to_shuffle = max(1, int(len(sentences) * rate)) + indices = random.sample(range(len(sentences)), + k=num_to_shuffle) + + # Extract sentences to shuffle + to_shuffle = [sentences[j] for j in sorted(indices)] + random.shuffle(to_shuffle) + + # Put back in original positions + for idx, j in enumerate(sorted(indices)): + sentences[j] = to_shuffle[idx] + + paragraphs[i] = ' '.join(sentences) + + return '\n'.join(paragraphs) + + @staticmethod + def html_modification(text: str, rate: float = 0.2) -> str: + """Modify HTML attributes while preserving structure""" + + # This is a simplified version - a real implementation would + # use an HTML parser + + # Modify attributes in tags + def replace_attr(match): + tag = match.group(1) + attrs = match.group(2) + + # Randomly modify some attributes + if random.random() < rate: + # Add a random attribute + attrs += f' data-random="{random.randint(1, 1000)}"' + + return f'<{tag} {attrs}>' + + modified = re.sub(r'<(\w+)\s+([^>]+)>', replace_attr, text) + return modified + + @staticmethod + def whitespace_modification(text: str) -> str: + """Modify whitespace without changing content""" + # Replace multiple spaces with single space + modified = re.sub(r'\s+', ' ', text) + + # Randomly add extra newlines + sentences = re.split(r'(?<=[.!?])\s+', modified) + for i in range(len(sentences) - 1): + if random.random() < 0.2: + sentences[i] = sentences[i] + '\n' + + return ' '.join(sentences) + + @staticmethod + def case_modification(text: str, rate: float = 0.1) -> str: + """Change case of some words""" + words = text.split() + modifications = max(1, int(len(words) * rate)) + + for _ in range(modifications): + if not words: + break + i = random.randint(0, len(words) - 1) + + # Skip very short words + if len(words[i]) < 3: + continue + + # Apply case modification + mod_type = random.choice(['upper', 'lower', 'title']) + if mod_type == 'upper': + words[i] = words[i].upper() + elif mod_type == 'lower': + words[i] = words[i].lower() + else: + words[i] = words[i].title() + + return ' '.join(words) + + +class ModificationStrategy: + """Combines multiple modification strategies with configurable weights""" + + def __init__(self, strategies: Dict[Callable, float] = None): + """Initialize with strategies and their weights + + Args: + strategies: Dictionary mapping strategy functions to their weights + """ + if strategies is None: + # Default strategies and weights + self.strategies = { + TextModifier.character_swap: 0.2, + TextModifier.word_replacement: 0.3, + TextModifier.sentence_reordering: 0.2, + TextModifier.whitespace_modification: 0.1, + 
TextModifier.case_modification: 0.2 + } + else: + self.strategies = strategies + + def apply(self, text: str, intensity: float = 0.5) -> str: + """Apply modification strategies based on weights and intensity + + Args: + text: Text to modify + intensity: Overall modification intensity (0.0 to 1.0) + + Returns: + Modified text + """ + modified = text + + # Normalize weights + total_weight = sum(self.strategies.values()) + normalized_weights = { + k: v / total_weight + for k, v in self.strategies.items() + } + + # Apply strategies based on weights and intensity + for strategy, weight in normalized_weights.items(): + if random.random() < weight * intensity: + modified = strategy(modified) + + return modified From fa4ac51a0e987f26f6be173c959f8c9ee69786f7 Mon Sep 17 00:00:00 2001 From: cyruszhang Date: Tue, 4 Mar 2025 13:48:04 -0800 Subject: [PATCH 3/3] remove dup files --- tests/ops/data/img1_dup.png | 1 - tests/ops/data/img2_dup.jpg | 1 - tests/ops/data/img3_dup.jpg | 1 - tests/ops/data/img3_dup_dup.jpg | 1 - tests/ops/data/video1_dup.mp4 | 1 - tests/ops/data/video2_dup.mp4 | 1 - tests/ops/data/video3_dup.mp4 | 1 - tests/ops/data/video3_dup_dup.mp4 | 1 - 8 files changed, 8 deletions(-) delete mode 120000 tests/ops/data/img1_dup.png delete mode 120000 tests/ops/data/img2_dup.jpg delete mode 120000 tests/ops/data/img3_dup.jpg delete mode 120000 tests/ops/data/img3_dup_dup.jpg delete mode 120000 tests/ops/data/video1_dup.mp4 delete mode 120000 tests/ops/data/video2_dup.mp4 delete mode 120000 tests/ops/data/video3_dup.mp4 delete mode 120000 tests/ops/data/video3_dup_dup.mp4 diff --git a/tests/ops/data/img1_dup.png b/tests/ops/data/img1_dup.png deleted file mode 120000 index d62a859006..0000000000 --- a/tests/ops/data/img1_dup.png +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img1.png \ No newline at end of file diff --git a/tests/ops/data/img2_dup.jpg b/tests/ops/data/img2_dup.jpg deleted file mode 120000 index 8a99a25260..0000000000 --- a/tests/ops/data/img2_dup.jpg +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img2.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup.jpg b/tests/ops/data/img3_dup.jpg deleted file mode 120000 index 6e8c435e31..0000000000 --- a/tests/ops/data/img3_dup.jpg +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img3.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup_dup.jpg b/tests/ops/data/img3_dup_dup.jpg deleted file mode 120000 index f539c0972a..0000000000 --- a/tests/ops/data/img3_dup_dup.jpg +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/img3_dup.jpg \ No newline at end of file diff --git a/tests/ops/data/video1_dup.mp4 b/tests/ops/data/video1_dup.mp4 deleted file mode 120000 index 6d1bbbc84b..0000000000 --- a/tests/ops/data/video1_dup.mp4 +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video1.mp4 \ No newline at end of file diff --git a/tests/ops/data/video2_dup.mp4 b/tests/ops/data/video2_dup.mp4 deleted file mode 120000 index 8fa6335be5..0000000000 --- a/tests/ops/data/video2_dup.mp4 +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video2.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup.mp4 b/tests/ops/data/video3_dup.mp4 deleted file mode 120000 index f631588609..0000000000 --- a/tests/ops/data/video3_dup.mp4 +++ /dev/null @@ -1 +0,0 @@ 
-/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video3.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup_dup.mp4 b/tests/ops/data/video3_dup_dup.mp4 deleted file mode 120000 index 6a225ba396..0000000000 --- a/tests/ops/data/video3_dup_dup.mp4 +++ /dev/null @@ -1 +0,0 @@ -/Users/yilei.z/dev/data-juicer/tests/ops/deduplicator/../data/video3_dup.mp4 \ No newline at end of file
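
A minimal usage sketch for the new dupe_gen module follows; it is not part of the patches above. It composes DuplicationConfig, DuplicateGenerator, and a custom ModificationStrategy, and exercises the 'power_law' distribution that the examples in app.py do not cover. The import paths assume the repository root is on PYTHONPATH and that tools/dedupe_suite/dupe_gen is importable as a package; the dataset and output paths are placeholders.

from tools.dedupe_suite.dupe_gen.generator import (DuplicateGenerator,
                                                   DuplicationConfig)
from tools.dedupe_suite.dupe_gen.modifier import (ModificationStrategy,
                                                  TextModifier)

if __name__ == '__main__':
    # Skewed duplication profile: a handful of documents collect most of the
    # duplicates, and most duplicates are lightly modified near-duplicates.
    config = DuplicationConfig(
        ratio=0.4,                 # 40% of the final corpus are duplicates
        distribution='power_law',  # 'random' and 'clustered' also supported
        types={'exact': 0.25, 'near': 0.5, 'far': 0.25},
        modification_levels={'near': 0.05, 'far': 0.25},
    )
    generator = DuplicateGenerator(config)

    # Optionally bias the text modifier toward word-level edits; weights are
    # normalized inside ModificationStrategy.apply().
    generator.modification_strategy = ModificationStrategy({
        TextModifier.word_replacement: 0.5,
        TextModifier.character_swap: 0.3,
        TextModifier.case_modification: 0.2,
    })

    # Placeholder paths; generate_from_dataset writes the duplicated corpus
    # plus an <output_path>.stats.json summary of the injected duplicates.
    stats = generator.generate_from_dataset(
        dataset_path='data/c4_sample.jsonl',
        output_path='c4_power_law_dups.jsonl')
    print(stats)

A natural follow-up is to run the c4-baseline.yaml recipe over the generated file and check how many of the injected near and far duplicates its document_simhash_deduplicator recovers.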