
Create Cache class for exact, fuzzy, and semantic deduplication #384

Open · wants to merge 34 commits into base: main

Commits (34)
769e2ea
add global cache variable and use it for exact dedup
sarahyurick Nov 19, 2024
b77139c
global cache for semdedup
sarahyurick Nov 19, 2024
337cec8
run black and modify pytest
sarahyurick Nov 19, 2024
6d55d8c
update image notebook
sarahyurick Nov 20, 2024
622912b
Merge branch 'main' into global_cache_dir
sarahyurick Nov 20, 2024
4cb26d5
save fuzzy dedup progress
sarahyurick Nov 20, 2024
b001622
save progress
sarahyurick Nov 20, 2024
0c14626
update remaining docs
sarahyurick Nov 20, 2024
7486459
run black
sarahyurick Nov 20, 2024
053f312
Merge branch 'main' into global_cache_dir
sarahyurick Dec 6, 2024
1b1ba30
Merge branch 'main' into global_cache_dir
sarahyurick Dec 11, 2024
4b12651
Merge branch 'main' into global_cache_dir
sarahyurick Dec 17, 2024
4160471
Merge branch 'main' into global_cache_dir
sarahyurick Dec 20, 2024
8a22ace
Merge branch 'main' into global_cache_dir
sarahyurick Dec 23, 2024
5e9bef1
Merge branch 'main' into global_cache_dir
sarahyurick Jan 3, 2025
d823a0b
Merge remote-tracking branch 'upstream/main' into global_cache_dir
sarahyurick Jan 21, 2025
0890fb0
re-add get_cache_directory changes
sarahyurick Jan 21, 2025
8fd79fb
create Cache singleton class
sarahyurick Jan 21, 2025
0d7b969
update exact_dedup
sarahyurick Jan 22, 2025
2c1a435
add semdedup functionality with Cache
sarahyurick Jan 22, 2025
f0ff2ce
add semdedup_example script
sarahyurick Jan 22, 2025
a379893
Cache singleton option for fuzzy dedup
sarahyurick Jan 23, 2025
67f609c
run black
sarahyurick Jan 23, 2025
8693177
fix tutorials
sarahyurick Jan 23, 2025
c296cc7
Merge branch 'main' into global_cache_dir
sarahyurick Jan 29, 2025
510347c
Merge branch 'main' into global_cache_dir
sarahyurick Feb 18, 2025
0635ebf
run black
sarahyurick Feb 18, 2025
a229857
import assert_eq
sarahyurick Feb 18, 2025
30ec409
fix semdedup test
sarahyurick Feb 19, 2025
1a63468
Merge branch 'main' into global_cache_dir
sarahyurick Feb 20, 2025
2075588
Merge branch 'main' into global_cache_dir
sarahyurick Feb 25, 2025
a6c5de3
remove repeating param
sarahyurick Feb 25, 2025
b805ce9
Merge remote-tracking branch 'upstream/main' into global_cache_dir
sarahyurick Feb 28, 2025
2ee3547
fix semdedup test
sarahyurick Feb 28, 2025
1 change: 1 addition & 0 deletions config/fuzzy_dedup_config.yaml
@@ -1,4 +1,5 @@
cache_dir: "./fuzzy_dedup_cache"

# Optional Params below with default values
# profile_dir: null
# id_field: "id"
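For reference, the keys in this YAML map directly onto the fuzzy-deduplication configuration. A minimal sketch of reading it from Python, assuming PyYAML is installed (the config class NeMo Curator actually uses may parse it differently):

import yaml

# Load the fuzzy dedup config and inspect the cache directory setting.
with open("config/fuzzy_dedup_config.yaml") as f:
    config = yaml.safe_load(f)

print(config["cache_dir"])  # ./fuzzy_dedup_cache
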
2 changes: 0 additions & 2 deletions config/sem_dedup_config.yaml
@@ -11,13 +11,11 @@ write_embeddings_to_disk: true
# Clustering configuration
clustering_save_loc: "clustering_results"
n_clusters: 1000
seed: 1234
max_iter: 100
kmeans_with_cos_dist: false

# Semdedup configuration
which_to_keep: "hard"
largest_cluster_size_to_process: 100000
sim_metric: "cosine"

# Extract dedup configuration
13 changes: 7 additions & 6 deletions docs/user-guide/semdedup.rst
@@ -50,13 +50,11 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
# Clustering configuration
clustering_save_loc: "clustering_results"
n_clusters: 1000
seed: 1234
max_iter: 100
kmeans_with_cos_dist: false

# Semdedup configuration
which_to_keep: "hard"
largest_cluster_size_to_process: 100000
sim_metric: "cosine"

# Extract dedup configuration
@@ -170,7 +168,8 @@ Use Individual Components
embedding_creator = EmbeddingCreator(
embedding_model_name_or_path="path/to/pretrained/model",
embedding_batch_size=128,
embedding_output_dir="path/to/output/embeddings",
cache_dir="path/to/output",
embeddings_save_loc="embeddings",
input_column="text",
logger="path/to/log/dir",
)
@@ -188,7 +187,8 @@ Use Individual Components
id_column="doc_id",
max_iter=100,
n_clusters=50000,
clustering_output_dir="path/to/output/clusters",
cache_dir="path/to/output",
clustering_save_loc="clustering_results",
logger="path/to/log/dir"
)
clustered_dataset = clustering_model(embeddings_dataset)
@@ -202,12 +202,13 @@ Use Individual Components
# Step 3: Semantic Deduplication
semantic_dedup = SemanticClusterLevelDedup(
n_clusters=50000,
emb_by_clust_dir="path/to/embeddings/by/cluster",
sorted_clusters_dir="path/to/sorted/clusters",
id_column="doc_id",
id_column_type="str",
which_to_keep="hard",
output_dir="path/to/output/deduped",
# cache_dir and clustering_save_loc should match ClusteringModel
cache_dir="path/to/output",
clustering_save_loc="clustering_results",
logger="path/to/log/dir"
)
semantic_dedup.compute_semantic_match_dfs()
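The pattern in the updated docs is that each stage now takes the shared cache_dir plus a save_loc name instead of a full output path. A rough sketch of the directory layout this implies, assuming outputs land under os.path.join(cache_dir, save_loc) — the actual join happens inside NeMo Curator and may differ:

import os

cache_dir = "path/to/output"  # shared across all three components

# Assumed layout: each stage writes under cache_dir/<save_loc>.
embeddings_dir = os.path.join(cache_dir, "embeddings")          # EmbeddingCreator
clustering_dir = os.path.join(cache_dir, "clustering_results")  # ClusteringModel

# SemanticClusterLevelDedup must be given the same cache_dir and
# clustering_save_loc as ClusteringModel so it can find these results.
print(embeddings_dir)  # path/to/output/embeddings
print(clustering_dir)  # path/to/output/clustering_results
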
1 change: 0 additions & 1 deletion examples/exact_deduplication.py
@@ -64,7 +64,6 @@ def main(args):
if isinstance(duplicates, str):
duplicates = DocumentDataset.read_parquet(duplicates, backend=backend)

# It's easy to apply dataframe operations to the dataset by using the underlying df.
result = exact_dup.remove(input_dataset, duplicates)
write_to_disk(result, output_dir, output_type="parquet")
print(time.time() - t0)
6 changes: 3 additions & 3 deletions examples/fuzzy_deduplication.py
@@ -38,7 +38,7 @@ def main(args):

filetype = "parquet"

# Fuzzy dup calculation only supports the cuDF/GPU backend
# Fuzzy deduplication only supports the cuDF/GPU backend
backend = "cudf"
assert args.device == "gpu"

@@ -89,12 +89,12 @@ def main(args):

if duplicates is None:
print("No duplicates found")
print(f"Time taken:{time.time() - t0}s")
print(f"Time taken: {time.time() - t0}s")
return

result = fuzzy_dup.remove(input_dataset, duplicates)
write_to_disk(result, output_dir, output_type=filetype)
print(f"Time taken:{time.time() - t0}s")
print(f"Time taken: {time.time() - t0}s")


def attach_args(
7 changes: 7 additions & 0 deletions examples/semdedup_example.py
@@ -49,23 +49,30 @@ def main(args):
log_level=logging.INFO,
stdout=True,
)

st = time.time()

input_files = get_all_files_paths_under(
root=args.input_data_dir,
)

if semdedup_config.num_files > 0:
input_files = input_files[: semdedup_config.num_files]

logger.info(f"Processing {len(input_files)} files")

ddf = read_data(
input_files=input_files,
file_type=args.input_file_type,
add_filename=False,
backend="cudf",
)
dataset = DocumentDataset(ddf)

semdup = SemDedup(semdedup_config, logger=logger)
dedup_ids = semdup(dataset)
print(dedup_ids.df.head())

logger.info(f"Time taken: {time.time() - st}")
client.cancel(client.futures, force=True)
client.close()
48 changes: 48 additions & 0 deletions nemo_curator/cache.py
@@ -0,0 +1,48 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_curator.utils.file_utils import expand_outdir_and_mkdir


class Cache:
_instance = None
_cache_dir = None

def __new__(cls, cache_dir=None):
if cls._instance is None:
cls._instance = super(Cache, cls).__new__(cls)
if cache_dir is not None:
cls._cache_dir = expand_outdir_and_mkdir(cache_dir)
else:
cls._cache_dir = None
elif cache_dir is not None and cls._cache_dir is None:
cls._cache_dir = expand_outdir_and_mkdir(cache_dir)
return cls._instance

@classmethod
def get_cache_directory(cls) -> str:
"""
Retrieve the cache directory.
"""
return cls._cache_dir

@classmethod
def delete_cache_instance(cls):
"""
Reset the Cache singleton.
"""
if cls._cache_dir is not None:
cls._cache_dir = None

cls._instance = None
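
Reading the class above: the first construction fixes the cache directory, later constructions return the same instance and silently ignore a different cache_dir once one is set, and delete_cache_instance() resets both the instance and the directory. A small usage sketch (directory names are illustrative):

from nemo_curator.cache import Cache

# First construction creates the singleton and sets the cache directory
# (expand_outdir_and_mkdir expands the path and creates it on disk).
cache = Cache(cache_dir="./curator_cache")
print(Cache.get_cache_directory())

# Later constructions return the same instance; the new cache_dir is
# ignored because _cache_dir is already set.
same_cache = Cache(cache_dir="./other_cache")
assert cache is same_cache

# Reset the singleton so a different directory can be configured.
Cache.delete_cache_instance()
Cache(cache_dir="./other_cache")
print(Cache.get_cache_directory())

This design lets the exact, fuzzy, and semantic dedup modules look the directory up via Cache.get_cache_directory() rather than threading cache_dir through every constructor.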