import argparse, os, re, tempfile
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from pathlib import Path
from typing import List

from xklb import db_media, usage
from xklb.media import media_printer
+from xklb.scripts import sample_compare, sample_hash
from xklb.utils import consts, db_utils, devices, file_utils, iterables, objects, strings
from xklb.utils.consts import DBType
from xklb.utils.log_utils import log
@@ -54,6 +57,7 @@ def parse_args() -> argparse.Namespace:
    )
    profile.add_argument(
        "--filesystem",
+        "--fs",
        action="store_const",
        dest="profile",
        const=DBType.filesystem,
@@ -332,6 +336,87 @@ def get_duration_duplicates(args) -> List[dict]:
    return media


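+# Three-stage filesystem dedupe: group by exact size, winnow with a cheap
+# sample hash, then confirm duplicates with a full-content hash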
+def get_fs_duplicates(args) -> List[dict]:
+    m_columns = db_utils.columns(args, "media")
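+    # select the stored hash column when the table has one, so already-hashed
+    # files can be skipped; ORDER BY decides which copy of each group is kept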
+    query = f"""
+        SELECT
+            path
+            , size
+            {', hash' if 'hash' in m_columns else ''}
+        FROM
+            {args.table} m1
+        WHERE 1=1
+            and coalesce(m1.time_deleted,0) = 0
+            and m1.size > 0
+            {" ".join(args.filter_sql)}
+        ORDER BY 1=1
+            , length(m1.path)-length(REPLACE(m1.path, '{os.sep}', '')) DESC
+            , length(m1.path)-length(REPLACE(m1.path, '.', ''))
+            , length(m1.path)
+            , m1.size DESC
+            , m1.time_modified DESC
+            , m1.time_created DESC
+            , m1.duration DESC
+            , m1.path DESC
+    """
+    media = list(args.db.query(query, args.filter_bindings))
+
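+    # Stage 1: bucket by exact byte size; a file with a unique size has no duplicate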
+    size_groups = defaultdict(list)
+    for m in media:
+        size_groups[m['size']].append(m)
+    size_groups = [l for l in size_groups.values() if len(l) > 1]
+
+    log.info(
+        'Got %s size matches (%s dup groups). Doing sample-hash comparison...',
+        len(list(iterables.flatten(size_groups))),
+        len(size_groups),
+    )
+
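+    # Stage 2: compute a cheap sample hash for any file without a stored hash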
+    sample_hash_paths = [d['path'] for d in media if not d.get('hash')]
+    with ThreadPoolExecutor(max_workers=20) as pool:
+        hash_results = list(pool.map(sample_hash.sample_hash_file, sample_hash_paths))
+    for path, hash in zip(sample_hash_paths, hash_results):
+        for m in media:
+            if m['path'] == path:
+                m['hash'] = hash
+                args.db["media"].upsert(m, pk=["path"], alter=True)  # save sample-hash back to db
+
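+    # files sharing a sample hash are candidate duplicates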
+    sample_hash_groups = defaultdict(list)
+    for m in media:
+        sample_hash_groups[m['hash']].append(m)
+    sample_hash_groups = [l for l in sample_hash_groups.values() if len(l) > 1]
+
+    log.info(
+        'Got %s sample-hash matches (%s dup groups). Doing full hash comparison...',
+        len(list(iterables.flatten(sample_hash_groups))),
+        len(sample_hash_groups),
+    )
+
+    size_map = {}
+    for m in media:
+        size_map[m['path']] = m['size']
+
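+    # Stage 3: confirm candidates with a full-content hash; the first path in
+    # each confirmed group (per the query's ORDER BY) is the one kept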
+    dup_media = []
+    for g in sample_hash_groups:
+        check_paths = [d['path'] for d in g]
+        with ThreadPoolExecutor(max_workers=5) as pool:
+            hash_results = list(pool.map(sample_compare.full_hash_file, check_paths))
+        hash_groups = defaultdict(list)
+        for path, hash in zip(check_paths, hash_results):
+            hash_groups[hash].append(path)
+        for paths in hash_groups.values():
+            if len(paths) > 1:
+                keep_path = paths[0]
+                dup_media.extend(
+                    {'keep_path': keep_path, 'duplicate_path': p, 'duplicate_size': size_map[keep_path]}
+                    for p in paths[1:]
+                )
+
+    # TODO: update false-positive sample-hash matches? probably not, because then future sample-hash duplicates won't match
+
+    return dup_media
+
+
def filter_split_files(paths):
    pattern = r"\.\d{3,5}\."
    return filter(lambda x: not re.search(pattern, x), paths)
@@ -349,14 +434,7 @@ def dedupe_media() -> None:
    elif args.profile == "duration":
        duplicates = get_duration_duplicates(args)
    elif args.profile == DBType.filesystem:
-        print(
-            """
-    You should use `rmlint` instead:
-
-    $ rmlint --progress --partial-hidden --rank-by dOma
-            """,
-        )
-        return
+        duplicates = get_fs_duplicates(args)
    elif args.profile == DBType.image:
        print(
            """