Commit 2aee3e3

dedupe: add --fs option
1 parent 47a4754 commit 2aee3e3
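The new `--fs` flag is a short alias for the existing `--filesystem` dedupe profile, which this commit changes from printing an `rmlint` suggestion to finding duplicates itself. A hypothetical invocation, assuming the CLI subcommand is exposed as `dedupe` and using a made-up database name (neither is part of this diff):

    $ library dedupe --fs audio.db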

2 files changed (+87, -10 lines)

xklb/fs_extract.py (+1, -2)
@@ -200,8 +200,7 @@ def extract_metadata(mp_args, path) -> Optional[Dict[str, int]]:
     else:
         log.debug(f"{timer()-start} {path}")
 
-    if getattr(mp_args, "hash", False):
-        # TODO: it would be better if this was saved to and checked against an external global file
+    if getattr(mp_args, "hash", False) and media['type'] != "directory" and media['size'] > 0:
         media["hash"] = sample_hash.sample_hash_file(path)
 
     if getattr(mp_args, "move", False) and not file_utils.is_file_open(path):

xklb/media/dedupe.py (+86, -8)
@@ -1,4 +1,6 @@
 import argparse, os, re, tempfile
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from pathlib import Path
 from typing import List
@@ -7,6 +9,7 @@
 
 from xklb import db_media, usage
 from xklb.media import media_printer
+from xklb.scripts import sample_compare, sample_hash
 from xklb.utils import consts, db_utils, devices, file_utils, iterables, objects, strings
 from xklb.utils.consts import DBType
 from xklb.utils.log_utils import log
@@ -54,6 +57,7 @@ def parse_args() -> argparse.Namespace:
     )
     profile.add_argument(
         "--filesystem",
+        "--fs",
         action="store_const",
         dest="profile",
         const=DBType.filesystem,
@@ -332,6 +336,87 @@ def get_duration_duplicates(args) -> List[dict]:
     return media
 
 
+def get_fs_duplicates(args) -> List[dict]:
+    m_columns = db_utils.columns(args, "media")
+    query = f"""
+        SELECT
+            path
+            , size
+            {', hash' if 'hash' in m_columns else ''}
+        FROM
+            {args.table} m1
+        WHERE 1=1
+            and coalesce(m1.time_deleted,0) = 0
+            and m1.size > 0
+            {" ".join(args.filter_sql)}
+        ORDER BY 1=1
+            , length(m1.path)-length(REPLACE(m1.path, '{os.sep}', '')) DESC
+            , length(m1.path)-length(REPLACE(m1.path, '.', ''))
+            , length(m1.path)
+            , m1.size DESC
+            , m1.time_modified DESC
+            , m1.time_created DESC
+            , m1.duration DESC
+            , m1.path DESC
+    """
+    media = list(args.db.query(query, args.filter_bindings))
+
+    size_groups = defaultdict(list)
+    for m in media:
+        size_groups[m['size']].append(m)
+    size_groups = [l for l in size_groups.values() if len(l) > 1]
+
+    log.info(
+        'Got %s size matches (%s dup groups). Doing sample-hash comparison...',
+        len(list(iterables.flatten(size_groups))),
+        len(size_groups),
+    )
+
+    sample_hash_paths = [d['path'] for d in media if not d.get('hash')]
+    with ThreadPoolExecutor(max_workers=20) as pool:
+        hash_results = list(pool.map(sample_hash.sample_hash_file, sample_hash_paths))
+    for path, hash in zip(sample_hash_paths, hash_results):
+        for m in media:
+            if m['path'] == path:
+                m['hash'] = hash
+                args.db["media"].upsert(m, pk=["path"], alter=True)  # save sample-hash back to db
+
+    sample_hash_groups = defaultdict(list)
+    for m in media:
+        sample_hash_groups[m['hash']].append(m)
+    sample_hash_groups = [l for l in sample_hash_groups.values() if len(l) > 1]
+
+    log.info(
+        'Got %s sample-hash matches (%s dup groups). Doing full hash comparison...',
+        len(list(iterables.flatten(sample_hash_groups))),
+        len(sample_hash_groups),
+    )
+
+    size_map = {}
+    for m in media:
+        size_map[m['path']] = m['size']
+
+    dup_media = []
+    for g in sample_hash_groups:
+        check_paths = [d['path'] for d in g]
+        with ThreadPoolExecutor(max_workers=5) as pool:
+            hash_results = list(pool.map(sample_compare.full_hash_file, check_paths))
+        hash_groups = defaultdict(list)
+        for path, hash in zip(check_paths, hash_results):
+            hash_groups[hash].append(path)
+        for paths in hash_groups.values():
+            if len(paths) > 1:
+                keep_path = paths[0]
+                dup_media.extend(
+                    {'keep_path': keep_path, 'duplicate_path': p, 'duplicate_size': size_map[keep_path]}
+                    for p in paths[1:]
+                )
+
+    # TODO: update false-positive sample-hash matches? probably no because then future sample-hash duplicates won't match
+
+    return dup_media
+
+
 def filter_split_files(paths):
     pattern = r"\.\d{3,5}\."
     return filter(lambda x: not re.search(pattern, x), paths)
@@ -349,14 +434,7 @@ def dedupe_media() -> None:
     elif args.profile == "duration":
         duplicates = get_duration_duplicates(args)
     elif args.profile == DBType.filesystem:
-        print(
-            """
-            You should use `rmlint` instead:
-
-                $ rmlint --progress --partial-hidden --rank-by dOma
-            """,
-        )
-        return
+        duplicates = get_fs_duplicates(args)
     elif args.profile == DBType.image:
         print(
             """
