
Commit 9dbc1ca

parallelize better
1 parent 2501d11 commit 9dbc1ca

1 file changed: +18 −15 lines changed

xklb/media/dedupe.py

@@ -401,22 +401,25 @@ def get_fs_duplicates(args) -> List[dict]:
         len(sample_hash_groups),
     )
 
+    with ThreadPoolExecutor(max_workers=20) as pool:
+        path_hash_map = {
+            k: v for k, v in zip(sample_hash_paths, pool.map(sample_compare.full_hash_file, sample_hash_paths))
+        }
+
+    full_hash_groups = defaultdict(list)
+    for path, hash in path_hash_map.items():
+        if hash is not None:
+            full_hash_groups[hash].append(path)
+    full_hash_groups = [l for l in full_hash_groups.values() if len(l) > 1]
+
     dup_media = []
-    for g in sample_hash_groups:
-        check_paths = [d["path"] for d in g]
-        with ThreadPoolExecutor(max_workers=5) as pool:
-            hash_results = list(pool.map(sample_compare.full_hash_file, check_paths))
-        hash_groups = defaultdict(list)
-        for path, hash in zip(check_paths, hash_results):
-            if hash is not None:
-                hash_groups[hash].append(path)
-        for paths in hash_groups.values():
-            if len(paths) > 1:
-                keep_path = paths[0]
-                dup_media.extend(
-                    {"keep_path": keep_path, "duplicate_path": p, "duplicate_size": path_media_map[keep_path]['size']}
-                    for p in paths[1:]
-                )
+    for hash_group_paths in full_hash_groups:
+        paths = [d['path'] for d in media if d['path'] in hash_group_paths]  # get the correct order from media
+        keep_path = paths[0]
+        dup_media.extend(
+            {"keep_path": keep_path, "duplicate_path": p, "duplicate_size": path_media_map[keep_path]['size']}
+            for p in paths[1:]
+        )
 
     # TODO: update false-positive sample-hash matches? probably no because then future sample-hash duplicates won't match
 
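For readers skimming the diff, here is a minimal, self-contained sketch of the pattern the new code adopts: hash every candidate path once through a single shared thread pool, then group paths by their full hash. The full_hash_file body and the group_by_full_hash helper below are illustrative stand-ins assumed for this sketch, not xklb's actual implementation (the real code calls sample_compare.full_hash_file on sample_hash_paths).

# Sketch only: one shared pool over all candidate paths, then group by full hash.
import hashlib
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional


def full_hash_file(path: str) -> Optional[str]:
    # Illustrative stand-in for sample_compare.full_hash_file: hash the whole
    # file, returning None for paths that cannot be read.
    try:
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                h.update(chunk)
        return h.hexdigest()
    except OSError:
        return None


def group_by_full_hash(paths: List[str]) -> List[List[str]]:
    # One shared pool over every candidate path (the new approach) instead of
    # a small pool per sample-hash group (the old approach), so file I/O can
    # overlap across all groups.
    with ThreadPoolExecutor(max_workers=20) as pool:
        path_hash_map = dict(zip(paths, pool.map(full_hash_file, paths)))

    groups = defaultdict(list)
    for path, file_hash in path_hash_map.items():
        if file_hash is not None:
            groups[file_hash].append(path)

    # Only hashes shared by more than one path are duplicates.
    return [group for group in groups.values() if len(group) > 1]

Compared with the old per-group pool of 5 workers, a single pool of 20 over all candidate paths keeps file I/O overlapping across groups instead of serializing between them, which is what the commit message "parallelize better" refers to.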
