@@ -401,22 +401,25 @@ def get_fs_duplicates(args) -> List[dict]:
         len(sample_hash_groups),
     )
 
+    with ThreadPoolExecutor(max_workers=20) as pool:
+        path_hash_map = {
+            k: v for k, v in zip(sample_hash_paths, pool.map(sample_compare.full_hash_file, sample_hash_paths))
+        }
+
+    full_hash_groups = defaultdict(list)
+    for path, hash in path_hash_map.items():
+        if hash is not None:
+            full_hash_groups[hash].append(path)
+    full_hash_groups = [l for l in full_hash_groups.values() if len(l) > 1]
+
     dup_media = []
-    for g in sample_hash_groups:
-        check_paths = [d["path"] for d in g]
-        with ThreadPoolExecutor(max_workers=5) as pool:
-            hash_results = list(pool.map(sample_compare.full_hash_file, check_paths))
-        hash_groups = defaultdict(list)
-        for path, hash in zip(check_paths, hash_results):
-            if hash is not None:
-                hash_groups[hash].append(path)
-        for paths in hash_groups.values():
-            if len(paths) > 1:
-                keep_path = paths[0]
-                dup_media.extend(
-                    {"keep_path": keep_path, "duplicate_path": p, "duplicate_size": path_media_map[keep_path]['size']}
-                    for p in paths[1:]
-                )
+    for hash_group_paths in full_hash_groups:
+        paths = [d['path'] for d in media if d['path'] in hash_group_paths]  # get the correct order from media
+        keep_path = paths[0]
+        dup_media.extend(
+            {"keep_path": keep_path, "duplicate_path": p, "duplicate_size": path_media_map[keep_path]['size']}
+            for p in paths[1:]
+        )
 
     # TODO: update false-positive sample-hash matches? probably no because then future sample-hash duplicates won't match
 
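For reference, the refactor replaces a ThreadPoolExecutor created per sample-hash group with a single pool that full-hashes every candidate path once and then buckets paths by that hash. Below is a minimal, self-contained sketch of that pattern; full_hash_file here is a hypothetical stand-in for sample_compare.full_hash_file (assumed to return None when a file cannot be read), and the hashlib and chunk-size details are illustrative, not the project's actual implementation.

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import hashlib


def full_hash_file(path):
    # Stand-in for sample_compare.full_hash_file: hash the whole file,
    # returning None if the file cannot be read.
    try:
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                h.update(chunk)
        return h.hexdigest()
    except OSError:
        return None


def group_by_full_hash(paths, max_workers=20):
    # Hash every path once with a single pool, then keep only the
    # hash buckets that contain more than one path (confirmed duplicates).
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        path_hash_map = dict(zip(paths, pool.map(full_hash_file, paths)))

    groups = defaultdict(list)
    for path, file_hash in path_hash_map.items():
        if file_hash is not None:
            groups[file_hash].append(path)
    return [g for g in groups.values() if len(g) > 1]

Hashing all paths through one pool lets files from different sample-hash groups be read concurrently and avoids constructing a new pool on every loop iteration, which is presumably the motivation for the change.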