Skip to content

Commit 2501d11

Browse files
committed
dedupe
1 parent 24b2a36 commit 2501d11

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

xklb/media/dedupe.py

+6 -5
Original file line number | Diff line number | Diff line change
@@ -376,12 +376,12 @@ def get_fs_duplicates(args) -> List[dict]:
376376

377377
path_media_map = {d["path"]: d for d in media}
378378

379-
sample_hash_paths = [d["path"] for d in media if not d.get("hash")]
380-
if sample_hash_paths:
379+
need_sample_hash_paths = [d["path"] for d in media if not d.get("hash")]
380+
if need_sample_hash_paths:
381381
with ThreadPoolExecutor(max_workers=20) as pool:
382-
hash_results = list(pool.map(sample_hash.sample_hash_file, sample_hash_paths))
382+
hash_results = list(pool.map(sample_hash.sample_hash_file, need_sample_hash_paths))
383383

384-
for path, hash in zip(sample_hash_paths, hash_results):
384+
for path, hash in zip(need_sample_hash_paths, hash_results):
385385
if hash is None:
386386
del path_media_map[path]
387387
else:
@@ -394,9 +394,10 @@ def get_fs_duplicates(args) -> List[dict]:
394394
sample_hash_groups[m["hash"]].append(m)
395395
sample_hash_groups = [l for l in sample_hash_groups.values() if len(l) > 1]
396396

397+
sample_hash_paths = {d["path"] for g in sample_hash_groups for d in g}
397398
log.info(
398399
"Got %s sample-hash duplicates (%s groups). Doing full hash comparison...",
399-
len(list(iterables.flatten(sample_hash_groups))),
400+
len(sample_hash_paths),
400401
len(sample_hash_groups),
401402
)
402403

0 commit comments

Comments
 (0)