44import sys
55from difflib import SequenceMatcher
66
7- from .index import get_index
7+ from .sums import Chunksums
88
99
1010def diff_ratio (a , b , sizes1 , sizes2 ):
@@ -30,16 +30,9 @@ def diff_ratio(a, b, sizes1, sizes2):
3030 return ratio
3131
3232
33- def get_file_info (file_id , index ):
34- index .file_id2chunk
35- path = index ._file_ids [file_id ]
36- size = index ._files [path ]["size" ]
37- return path , size
38-
39-
40- def get_dup_file_id_pairs (index1 , index2 ):
41- chunks1 = index1 .chunk2file_id
42- chunks2 = index2 .chunk2file_id
33+ def get_dup_file_id_pairs (chunksums1 , chunksums2 ):
34+ chunks1 = chunksums1 .chunk2file_id
35+ chunks2 = chunksums2 .chunk2file_id
4336 same_chunks = set (chunks1 ) & set (chunks2 )
4437
4538 same_file_ids1 = {c : chunks1 [c ] for c in same_chunks }
@@ -50,89 +43,87 @@ def get_dup_file_id_pairs(index1, index2):
5043 ids1 = same_file_ids1 [c ]
5144 ids2 = same_file_ids2 [c ]
5245 file_id_pairs .extend ([(x , y ) for x in ids1 for y in ids2 ])
53- return list (set (file_id_pairs ))
54-
46+ return sorted (set (file_id_pairs ))
5547
56- def find_dup_files (index1 , index2 ):
57- file_id_pairs = get_dup_file_id_pairs (index1 , index2 )
5848
59- file_ids1 = index1 . file_id2chunk
60- file_ids2 = index2 . file_id2chunk
49+ def find_dup_files ( chunksums1 , chunksums2 ):
50+ file_id_pairs = get_dup_file_id_pairs ( chunksums1 , chunksums2 )
6151
6252 dups = {}
63- for f1 , f2 in file_id_pairs :
64- ids1 = file_ids1 [f1 ]
65- ids2 = file_ids2 [f2 ]
66- path1 , size1 = get_file_info (f1 , index1 )
67- path2 , size2 = get_file_info (f2 , index2 )
53+ for hash1 , hash2 in file_id_pairs :
54+ f1 = chunksums1 .hashes [hash1 ]
55+ f2 = chunksums2 .hashes [hash2 ]
6856 # avoid comparing the same two files twice
69- if (size2 , path2 , size1 , path1 ) in dups :
57+ if (f2 . size , f2 . path , f1 . size , f1 . path ) in dups :
7058 continue
7159
72- ratio = diff_ratio (ids1 , ids2 , index1 .chunk2size , index2 .chunk2size )
73- if path1 == path2 and ratio == 1.0 :
60+ ratio = diff_ratio (
61+ f1 .hashes ,
62+ f2 .hashes ,
63+ dict (f1 .chunks ),
64+ dict (f2 .chunks ),
65+ )
66+ if f1 .path == f2 .path and ratio == 1.0 :
7467 continue
75- dups [(size1 , path1 , size2 , path2 )] = ratio
68+ dups [(f1 . size , f1 . path , f2 . size , f2 . path )] = ratio
7669 return [[ratio ] + list (key ) for key , ratio in dups .items ()]
7770
7871
79- def find_dup (chunksum_file1 , chunksum_file2 ):
72+ def find_dup (chunksums1 , chunksums2 ):
8073 """
8174 >>> import io
8275 >>> from pprint import pprint
8376 >>> chunksum1 = '''
84- ... sum1 /A/1 fck0sha2!a :10,b :10
85- ... sum2 /A/2 fck0sha2!c :10,d :10,e :10
86- ... sum3 /A/3 fck0sha2!f :10,g :10
87- ... sum4 /A/4 fck0sha2!h :10
77+ ... bee1 /A/1 fck0sha2!aa :10,bb :10
78+ ... bee2 /A/2 fck0sha2!cc :10,dd :10,ee :10
79+ ... bee3 /A/3 fck0sha2!ff :10,f0 :10
80+ ... bee4 /A/4 fck0sha2!f1 :10
8881 ... '''
8982 >>> chunksum2 = '''
90- ... sum5 /B/1 fck0sha2!m :10,n :10
91- ... sum6 /B/2 fck0sha2!c :10,d :10,f :10
92- ... sum7 /B/3 fck0sha2!f :10,x :10
93- ... sum8 /B/4 fck0sha2!h :10
83+ ... bee5 /B/1 fck0sha2!a1 :10,a2 :10
84+ ... bee6 /B/2 fck0sha2!cc :10,dd :10,ff :10
85+ ... bee7 /B/3 fck0sha2!ff :10,a3 :10
86+ ... bee8 /B/4 fck0sha2!f1 :10
9487 ... '''
95- >>> file1 = io.StringIO(chunksum1)
96- >>> file2 = io.StringIO(chunksum2)
88+ >>> file1 = Chunksums.parse( io.StringIO(chunksum1) )
89+ >>> file2 = Chunksums.parse( io.StringIO(chunksum2) )
9790 >>> pprint(find_dup(file1, file2))
9891 [[1.0, 10, '/A/4', 10, '/B/4'],
9992 [0.6666666666666666, 30, '/A/2', 30, '/B/2'],
10093 [0.5, 20, '/A/3', 20, '/B/3'],
10194 [0.4, 20, '/A/3', 30, '/B/2']]
10295
10396 >>> chunksum_repeat = '''
104- ... sum a fck0sha2!a :1,a :1,a :1,b :2
105- ... sum b fck0sha2!a :1,b :2
106- ... sum c fck0sha2!a :1,a :1,a :1,b :2
97+ ... bee1 a fck0sha2!aa :1,aa :1,aa :1,bb :2
98+ ... bee2 b fck0sha2!aa :1,bb :2
99+ ... bee3 c fck0sha2!aa :1,aa :1,aa :1,bb :2
107100 ... '''
108- >>> file1 = io.StringIO(chunksum_repeat)
109- >>> file2 = io.StringIO(chunksum_repeat)
101+ >>> file1 = Chunksums.parse( io.StringIO(chunksum_repeat) )
102+ >>> file2 = Chunksums.parse( io.StringIO(chunksum_repeat) )
110103 >>> pprint(find_dup(file1, file2))
111- [[1.0, 5, 'c ', 5, 'a '], [0.75, 5, 'a', 3, 'b'], [0.75, 3, 'b', 5, 'c']]
104+ [[1.0, 5, 'a ', 5, 'c '], [0.75, 5, 'a', 3, 'b'], [0.75, 3, 'b', 5, 'c']]
112105 """
113- index1 = get_index (chunksum_file1 )
114- index2 = get_index (chunksum_file2 )
115- dups = sorted (find_dup_files (index1 , index2 ), reverse = True )
106+ dups = sorted (find_dup_files (chunksums1 , chunksums2 ), reverse = True )
116107 return dups
117108
118109
119110def print_plain_report (dups , output_file ):
120111 """
121112 >>> import io
122113 >>> chunksum1 = '''
123- ... sum1 /A/1 fck0sha2!a :10,b :10
124- ... sum2 /A/2 fck0sha2!c :10,d :10,e :10
125- ... sum3 /A/3 fck0sha2!f :10,g :10
126- ... sum4 /A/4 fck0sha2!h :10
114+ ... bee1 /A/1 fck0sha2!aa :10,bb :10
115+ ... bee2 /A/2 fck0sha2!cc :10,dd :10,ee :10
116+ ... bee3 /A/3 fck0sha2!ff :10,f0 :10
117+ ... bee4 /A/4 fck0sha2!f1 :10
127118 ... '''
128119 >>> chunksum2 = '''
129- ... sum5 /B/1 fck0sha2!m :10,n :10
130- ... sum6 /B/2 fck0sha2!c :10,d :10,f :10
131- ... sum7 /B/3 fck0sha2!f :10,x :10
132- ... sum8 /B/4 fck0sha2!h :10
120+ ... bee5 /B/1 fck0sha2!a1 :10,a2 :10
121+ ... bee6 /B/2 fck0sha2!cc :10,dd :10,ff :10
122+ ... bee7 /B/3 fck0sha2!ff :10,a3 :10
123+ ... bee8 /B/4 fck0sha2!f1 :10
133124 ... '''
134- >>> file1 = io.StringIO(chunksum1)
135- >>> file2 = io.StringIO(chunksum2)
125+ >>> file1 = Chunksums.parse( io.StringIO(chunksum1) )
126+ >>> file2 = Chunksums.parse( io.StringIO(chunksum2) )
136127 >>> dups = find_dup(file1, file2)
137128 >>> print_plain_report(dups, sys.stdout)
138129 100.00% /A/4 (10B) /B/4 (10B)
@@ -141,15 +132,15 @@ def print_plain_report(dups, output_file):
141132 40.00% /A/3 (20B) /B/2 (30B)
142133
143134 >>> chunksum_repeat = '''
144- ... sum a fck0sha2!a :1,a :1,a :1,b :2
145- ... sum b fck0sha2!a :1,b :2
146- ... sum c fck0sha2!a :1,a :1,a :1,b :2
135+ ... bee1 a fck0sha2!aa :1,aa :1,aa :1,bb :2
136+ ... bee2 b fck0sha2!aa :1,bb :2
137+ ... bee3 c fck0sha2!aa :1,aa :1,aa :1,bb :2
147138 ... '''
148- >>> file1 = io.StringIO(chunksum_repeat)
149- >>> file2 = io.StringIO(chunksum_repeat)
139+ >>> file1 = Chunksums.parse( io.StringIO(chunksum_repeat) )
140+ >>> file2 = Chunksums.parse( io.StringIO(chunksum_repeat) )
150141 >>> dups = find_dup(file1, file2)
151142 >>> print_plain_report(dups, sys.stdout)
152- 100.00% c (5B) a (5B)
143+ 100.00% a (5B) c (5B)
153144 75.00% a (5B) b (3B)
154145 75.00% b (3B) c (5B)
155146 """
@@ -190,10 +181,10 @@ def main():
190181
191182 >>> import tempfile
192183 >>> f1 = tempfile.NamedTemporaryFile()
193- >>> _ = f1.write(b'sum1 /A/1 fck0sha2!a :10,b :10')
184+ >>> _ = f1.write(b'bee1 /A/1 fck0sha2!aa :10,bb :10')
194185 >>> f1.flush()
195186 >>> f2 = tempfile.NamedTemporaryFile()
196- >>> _ = f2.write(b'sum2 /B/1 fck0sha2!c :10,b :10')
187+ >>> _ = f2.write(b'bee2 /B/1 fck0sha2!cc :10,bb :10')
197188 >>> f2.flush()
198189 >>> sys.argv = ['chunkdup', f1.name, f2.name]
199190 >>> main()
@@ -218,7 +209,10 @@ def main():
218209 parser .print_help ()
219210 sys .exit ()
220211
221- dups = find_dup (open (args .chunksums1 ), open (args .chunksums2 ))
212+ dups = find_dup (
213+ Chunksums .parse (open (args .chunksums1 )),
214+ Chunksums .parse (open (args .chunksums2 )),
215+ )
222216 print_plain_report (dups , sys .stdout )
223217
224218
0 commit comments