Commit 863da8d

pdm dedupe
1 parent: 8cd284a

File tree

2 files changed: +47 -61 lines

pdm.lock

+32 -43 (generated file; diff not rendered by default)

xklb/media/dedupe.py

+15 -18
@@ -357,7 +357,6 @@ def get_fs_duplicates(args) -> List[dict]:
         , m1.size DESC
         , m1.time_modified DESC
         , m1.time_created DESC
-        , m1.duration DESC
         , m1.path DESC
     """
     media = list(args.db.query(query, args.filter_bindings))
@@ -367,26 +366,28 @@ def get_fs_duplicates(args) -> List[dict]:
         size_groups[m["size"]].append(m)
     size_groups = [l for l in size_groups.values() if len(l) > 1]
 
-    size_paths = [d["path"] for g in size_groups for d in g]
+    size_paths = {d["path"] for g in size_groups for d in g}
     media = [d for d in media if d["path"] in size_paths]
     log.info(
         "Got %s size duplicates (%s groups). Doing sample-hash comparison...",
         len(size_paths),
         len(size_groups),
     )
 
+    path_media_map = {d["path"]: d for d in media}
+
     sample_hash_paths = [d["path"] for d in media if not d.get("hash")]
-    with ThreadPoolExecutor(max_workers=20) as pool:
-        hash_results = list(pool.map(sample_hash.sample_hash_file, sample_hash_paths))
-    for path, hash in zip(sample_hash_paths, hash_results):
-        for m in media:
-            if m["path"] == path:
-                if hash is None:
-                    media = [d for d in media if d["path"] != path]
-                else:
-                    m["hash"] = hash
-                    args.db["media"].upsert(m, pk=["path"], alter=True)  # save sample-hash back to db
-                break
+    if sample_hash_paths:
+        with ThreadPoolExecutor(max_workers=20) as pool:
+            hash_results = list(pool.map(sample_hash.sample_hash_file, sample_hash_paths))
+
+        for path, hash in zip(sample_hash_paths, hash_results):
+            if hash is None:
+                del path_media_map[path]
+            else:
+                path_media_map[path]["hash"] = hash
+                args.db["media"].upsert(path_media_map[path], pk=["path"], alter=True)  # save sample-hash back to db
+    media = [path_media_map[d['path']] for d in media if d['path'] in path_media_map]
 
     sample_hash_groups = defaultdict(list)
     for m in media:
@@ -399,10 +400,6 @@ def get_fs_duplicates(args) -> List[dict]:
         len(sample_hash_groups),
     )
 
-    size_map = {}
-    for m in media:
-        size_map[m["path"]] = m["size"]
-
     dup_media = []
     for g in sample_hash_groups:
         check_paths = [d["path"] for d in g]
@@ -415,7 +412,7 @@ def get_fs_duplicates(args) -> List[dict]:
         if len(paths) > 1:
             keep_path = paths[0]
             dup_media.extend(
-                {"keep_path": keep_path, "duplicate_path": p, "duplicate_size": size_map[keep_path]}
+                {"keep_path": keep_path, "duplicate_path": p, "duplicate_size": path_media_map[keep_path]['size']}
                 for p in paths[1:]
             )

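For context, a minimal standalone sketch of the flow this diff leaves in get_fs_duplicates: group candidate files by size, sample-hash only the entries that do not already carry a hash, and key everything by path in a dict so hash results can be attached (or failed entries dropped) with O(1) lookups instead of rescanning the media list. The sample_hash_file below is a simplified hypothetical stand-in for xklb's sample_hash.sample_hash_file, and the sqlite-utils upsert and the final per-group full comparison that the real function performs are omitted.

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from hashlib import sha256
from typing import Dict, List, Optional


def sample_hash_file(path: str) -> Optional[str]:
    # Hypothetical stand-in: hash the first 1 MiB instead of sampled chunks.
    try:
        with open(path, "rb") as f:
            return sha256(f.read(1024 * 1024)).hexdigest()
    except OSError:
        return None  # unreadable files get dropped, like the hash-is-None branch above


def find_fs_duplicates(media: List[dict]) -> List[dict]:
    # 1. Keep only files whose size collides with at least one other file.
    size_groups: Dict[int, List[dict]] = defaultdict(list)
    for m in media:
        size_groups[m["size"]].append(m)
    size_paths = {d["path"] for g in size_groups.values() if len(g) > 1 for d in g}
    media = [d for d in media if d["path"] in size_paths]

    # 2. Key the survivors by path so hash results can be attached by lookup.
    path_media_map = {d["path"]: d for d in media}

    # 3. Sample-hash only entries without a stored hash, in a thread pool.
    sample_hash_paths = [d["path"] for d in media if not d.get("hash")]
    if sample_hash_paths:
        with ThreadPoolExecutor(max_workers=20) as pool:
            hash_results = list(pool.map(sample_hash_file, sample_hash_paths))
        for path, hash in zip(sample_hash_paths, hash_results):
            if hash is None:
                del path_media_map[path]  # unreadable: drop from consideration
            else:
                path_media_map[path]["hash"] = hash  # the real code also upserts this to the db
    media = [path_media_map[d["path"]] for d in media if d["path"] in path_media_map]

    # 4. Group by (size, hash); everything after the first entry in a group is a duplicate.
    hash_groups: Dict[tuple, List[dict]] = defaultdict(list)
    for m in media:
        hash_groups[(m["size"], m["hash"])].append(m)
    dup_media = []
    for g in hash_groups.values():
        if len(g) > 1:
            keep = g[0]
            dup_media.extend(
                {"keep_path": keep["path"], "duplicate_path": d["path"], "duplicate_size": keep["size"]}
                for d in g[1:]
            )
    return dup_media

The point of path_media_map is the same as in the diff above: the old code rescanned media for every hashed path and kept a separate size_map just to fill duplicate_size, while a single path-keyed dict serves both the hash write-back and the size lookup. ThreadPoolExecutor.map returns results in input order, which is what makes the zip(sample_hash_paths, hash_results) pairing valid.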