@@ -357,7 +357,6 @@ def get_fs_duplicates(args) -> List[dict]:
         , m1.size DESC
         , m1.time_modified DESC
         , m1.time_created DESC
-        , m1.duration DESC
         , m1.path DESC
     """
     media = list(args.db.query(query, args.filter_bindings))
@@ -367,26 +366,28 @@ def get_fs_duplicates(args) -> List[dict]:
         size_groups[m["size"]].append(m)
     size_groups = [l for l in size_groups.values() if len(l) > 1]
 
-    size_paths = [d["path"] for g in size_groups for d in g]
+    size_paths = {d["path"] for g in size_groups for d in g}
     media = [d for d in media if d["path"] in size_paths]
     log.info(
         "Got %s size duplicates (%s groups). Doing sample-hash comparison...",
         len(size_paths),
         len(size_groups),
     )
 
+    path_media_map = {d["path"]: d for d in media}
+
     sample_hash_paths = [d["path"] for d in media if not d.get("hash")]
-    with ThreadPoolExecutor(max_workers=20) as pool:
-        hash_results = list(pool.map(sample_hash.sample_hash_file, sample_hash_paths))
-    for path, hash in zip(sample_hash_paths, hash_results):
-        for m in media:
-            if m["path"] == path:
-                if hash is None:
-                    media = [d for d in media if d["path"] != path]
-                else:
-                    m["hash"] = hash
-                    args.db["media"].upsert(m, pk=["path"], alter=True)  # save sample-hash back to db
-                break
+    if sample_hash_paths:
+        with ThreadPoolExecutor(max_workers=20) as pool:
+            hash_results = list(pool.map(sample_hash.sample_hash_file, sample_hash_paths))
+
+        for path, hash in zip(sample_hash_paths, hash_results):
+            if hash is None:
+                del path_media_map[path]
+            else:
+                path_media_map[path]["hash"] = hash
+                args.db["media"].upsert(path_media_map[path], pk=["path"], alter=True)  # save sample-hash back to db
+        media = [path_media_map[d['path']] for d in media if d['path'] in path_media_map]
 
     sample_hash_groups = defaultdict(list)
     for m in media:
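
For readers skimming the hunk above: the commit replaces a nested loop (scan all of media once per hashed path) with a single dict keyed by path, and skips spinning up the thread pool when every row already has a hash. Below is a minimal, self-contained sketch of that pattern, not the project's code: records and fake_hash are stand-ins for media and sample_hash.sample_hash_file, and the database upsert is omitted.

    from concurrent.futures import ThreadPoolExecutor
    from typing import Optional

    def fake_hash(path: str) -> Optional[str]:
        # stand-in for sample_hash.sample_hash_file; None mimics an unreadable file
        return None if path.endswith(".bad") else "h:" + path

    records = [{"path": "a.mkv"}, {"path": "b.bad"}, {"path": "c.mkv", "hash": "h:c.mkv"}]

    # one dict keyed by path replaces the old O(len(paths) * len(media)) nested scan
    path_media_map = {d["path"]: d for d in records}

    to_hash = [d["path"] for d in records if not d.get("hash")]
    if to_hash:  # the new guard: no thread pool when nothing needs hashing
        with ThreadPoolExecutor(max_workers=20) as pool:
            results = list(pool.map(fake_hash, to_hash))
        for path, h in zip(to_hash, results):
            if h is None:
                del path_media_map[path]  # drop files that could not be hashed
            else:
                path_media_map[path]["hash"] = h
        records = [path_media_map[d["path"]] for d in records if d["path"] in path_media_map]

    print(records)  # b.bad dropped; a.mkv now carries a hash

The dict also fixes a subtle cost in the old version: deleting a path rebuilt the entire media list from inside the loop, whereas del on a dict key is constant time.
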
@@ -399,10 +400,6 @@ def get_fs_duplicates(args) -> List[dict]:
         len(sample_hash_groups),
     )
 
-    size_map = {}
-    for m in media:
-        size_map[m["path"]] = m["size"]
-
     dup_media = []
     for g in sample_hash_groups:
         check_paths = [d["path"] for d in g]
@@ -415,7 +412,7 @@ def get_fs_duplicates(args) -> List[dict]:
         if len(paths) > 1:
             keep_path = paths[0]
             dup_media.extend(
-                {"keep_path": keep_path, "duplicate_path": p, "duplicate_size": size_map[keep_path]}
+                {"keep_path": keep_path, "duplicate_path": p, "duplicate_size": path_media_map[keep_path]['size']}
                 for p in paths[1:]
             )
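
Two smaller changes in the same spirit. The size_paths comprehension switches from a list to a set because the very next line tests d["path"] in size_paths once per media row: a linear scan on a list, a constant-time hash lookup on a set. A rough standalone sketch of the difference (timings are machine-dependent):

    import timeit

    paths = [f"/media/{i}.mkv" for i in range(50_000)]
    as_list, as_set = paths, set(paths)
    needle = paths[-1]  # worst case for the list: scans everything

    print(timeit.timeit(lambda: needle in as_list, number=200))  # linear scan
    print(timeit.timeit(lambda: needle in as_set, number=200))   # hash lookup

And since path_media_map already maps each path to its full row, the separate size_map dict is redundant: path_media_map[keep_path]['size'] retrieves the same value with one structure instead of two.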