Skip to content

Commit d045111

Browse files
authored
refactor: add new parser class (#21)
* add new parser * refactor to reduce code complexity * refactor to use the new parser
1 parent 21a5372 commit d045111

File tree

4 files changed

+227
-185
lines changed

4 files changed

+227
-185
lines changed

chunkdup/chunkdiff.py

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from itertools import groupby
55
from math import ceil
66

7-
from .index import get_index
7+
from .sums import Chunksums
88

99

1010
GREY = "\033[90m"
@@ -18,21 +18,6 @@
1818
END = "\033[0m"
1919

2020

21-
class FileNotExists(Exception):
22-
pass
23-
24-
25-
def get_info(chunksums_file, path):
26-
index = get_index(chunksums_file)
27-
try:
28-
id = index._files.get(path).get("id")
29-
except AttributeError:
30-
raise FileNotExists(f"file path not found: {path}")
31-
chunks = index.file_id2chunk[id]
32-
sizes = [index.chunk2size.get(id) for id in chunks]
33-
return chunks, sizes
34-
35-
3621
def find_diff(chunks1, sizes1, chunks2, sizes2):
3722
s = SequenceMatcher(a=chunks1, b=chunks2)
3823
diff = []
@@ -77,15 +62,13 @@ def padding_bar(width, max_width, line):
7762
return line1, line2
7863

7964

80-
def get_bar_layer(chunksums_file1, chunksums_file2, path1, path2, bar_width=40):
81-
chunks1, sizes1 = get_info(chunksums_file1, path1)
82-
chunks2, sizes2 = get_info(chunksums_file2, path2)
65+
def get_bar_layer(chunksums1, chunksums2, path1, path2, bar_width=40):
66+
f1 = chunksums1.get_file(path1)
67+
f2 = chunksums2.get_file(path2)
8368

84-
total, diff = find_diff(chunks1, sizes1, chunks2, sizes2)
85-
filesize1 = sum(sizes1)
86-
filesize2 = sum(sizes2)
69+
total, diff = find_diff(f1.hashes, f1.sizes, f2.hashes, f2.sizes)
8770
line1, line2 = fill_line(bar_width, total, diff)
88-
return line1, line2, filesize1, filesize2
71+
return line1, line2, f1.size, f2.size
8972

9073

9174
def print_2lines_bar(
@@ -184,8 +167,8 @@ def print_1line_bar(
184167

185168

186169
def print_diff(
187-
chunksums_file1,
188-
chunksums_file2,
170+
chunksums1,
171+
chunksums2,
189172
path1,
190173
path2,
191174
output=None,
@@ -197,23 +180,23 @@ def print_diff(
197180
>>> import sys
198181
>>> import tempfile
199182
>>> f1 = tempfile.NamedTemporaryFile()
200-
>>> _ = f1.write(b'sum1 ./a fck0sha2!a:10,b:10,c:10,r:5,s:5,t:5\\n')
183+
>>> _ = f1.write(b'bee1 ./a fck0sha2!aa:10,bb:10,cc:5,dd:5,f1:5\\n')
201184
>>> f1.flush()
202185
>>> f2 = tempfile.NamedTemporaryFile()
203-
>>> _ = f2.write(b'sum2 ./b fck0sha2!b:10,c:10,m:5,r:5,n:5,s:5,z:5\\n')
186+
>>> _ = f2.write(b'bee2 ./b fck0sha2!bb:10,f2:5,cc:5,f3:5,dd:5,f4:5\\n')
204187
>>> f2.flush()
205-
>>> a, b = open(f1.name), open(f2.name)
188+
>>> a = Chunksums.parse(open(f1.name))
189+
>>> b = Chunksums.parse(open(f2.name))
206190
>>> print_diff(a, b, './a', './b', color=False)
207-
▀45 ▄45 ▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▄▄▄▄▒▒▒▒▄▄▄▄▒▒▒▒████
208-
>>> a, b = open(f1.name), open(f2.name)
191+
▀35 ▄35 ▀▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒█████
209192
>>> print_diff(a, b, './a', './b', color=False, oneline=False)
210-
45 --------=============== ==== ====----
211-
45 ===============++++====++++====++++
193+
35 ---------========= ===== =====-----
194+
35 =========+++++=====+++++=====+++++
212195
"""
213196

214197
line1, line2, filesize1, filesize2 = get_bar_layer(
215-
chunksums_file1,
216-
chunksums_file2,
198+
chunksums1,
199+
chunksums2,
217200
path1,
218201
path2,
219202
bar_width=bar_width,
@@ -260,8 +243,8 @@ def main():
260243
>>> import tempfile
261244
>>> f = tempfile.NamedTemporaryFile()
262245
>>> _ = f.write(
263-
... b'sum1 ./a fck0sha2!a:10,b:10,c:10,r:5,s:5,t:5\\n'
264-
... b'sum2 ./b fck0sha2!b:10,c:10,m:10,x:5,s:5,y:5\\n'
246+
... b'bee1 ./a fck0sha2!aa:10,bb:10,cc:10,f1:5,dd:5,f2:5\\n'
247+
... b'bee2 ./b fck0sha2!bb:10,cc:10,f3:10,f4:5,dd:5,f5:5\\n'
265248
... )
266249
>>> f.flush()
267250
>>> s = f.name
@@ -330,15 +313,15 @@ def main():
330313

331314
try:
332315
print_diff(
333-
open(chunksums1),
334-
open(chunksums2),
316+
Chunksums.parse(open(chunksums1)),
317+
Chunksums.parse(open(chunksums2)), # FIXME open same file only once
335318
args.file1,
336319
args.file2,
337320
bar_width=args.barwidth,
338321
color=color,
339322
oneline=oneline,
340323
)
341-
except FileNotExists as e:
324+
except FileNotFoundError as e:
342325
print(e)
343326
sys.exit(1)
344327

chunkdup/chunkdup.py

Lines changed: 59 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import sys
55
from difflib import SequenceMatcher
66

7-
from .index import get_index
7+
from .sums import Chunksums
88

99

1010
def diff_ratio(a, b, sizes1, sizes2):
@@ -30,16 +30,9 @@ def diff_ratio(a, b, sizes1, sizes2):
3030
return ratio
3131

3232

33-
def get_file_info(file_id, index):
34-
index.file_id2chunk
35-
path = index._file_ids[file_id]
36-
size = index._files[path]["size"]
37-
return path, size
38-
39-
40-
def get_dup_file_id_pairs(index1, index2):
41-
chunks1 = index1.chunk2file_id
42-
chunks2 = index2.chunk2file_id
33+
def get_dup_file_id_pairs(chunksums1, chunksums2):
34+
chunks1 = chunksums1.chunk2file_id
35+
chunks2 = chunksums2.chunk2file_id
4336
same_chunks = set(chunks1) & set(chunks2)
4437

4538
same_file_ids1 = {c: chunks1[c] for c in same_chunks}
@@ -50,89 +43,87 @@ def get_dup_file_id_pairs(index1, index2):
5043
ids1 = same_file_ids1[c]
5144
ids2 = same_file_ids2[c]
5245
file_id_pairs.extend([(x, y) for x in ids1 for y in ids2])
53-
return list(set(file_id_pairs))
54-
46+
return sorted(set(file_id_pairs))
5547

56-
def find_dup_files(index1, index2):
57-
file_id_pairs = get_dup_file_id_pairs(index1, index2)
5848

59-
file_ids1 = index1.file_id2chunk
60-
file_ids2 = index2.file_id2chunk
49+
def find_dup_files(chunksums1, chunksums2):
50+
file_id_pairs = get_dup_file_id_pairs(chunksums1, chunksums2)
6151

6252
dups = {}
63-
for f1, f2 in file_id_pairs:
64-
ids1 = file_ids1[f1]
65-
ids2 = file_ids2[f2]
66-
path1, size1 = get_file_info(f1, index1)
67-
path2, size2 = get_file_info(f2, index2)
53+
for hash1, hash2 in file_id_pairs:
54+
f1 = chunksums1.hashes[hash1]
55+
f2 = chunksums2.hashes[hash2]
6856
# avoid compare two files twice
69-
if (size2, path2, size1, path1) in dups:
57+
if (f2.size, f2.path, f1.size, f1.path) in dups:
7058
continue
7159

72-
ratio = diff_ratio(ids1, ids2, index1.chunk2size, index2.chunk2size)
73-
if path1 == path2 and ratio == 1.0:
60+
ratio = diff_ratio(
61+
f1.hashes,
62+
f2.hashes,
63+
dict(f1.chunks),
64+
dict(f2.chunks),
65+
)
66+
if f1.path == f2.path and ratio == 1.0:
7467
continue
75-
dups[(size1, path1, size2, path2)] = ratio
68+
dups[(f1.size, f1.path, f2.size, f2.path)] = ratio
7669
return [[ratio] + list(key) for key, ratio in dups.items()]
7770

7871

79-
def find_dup(chunksum_file1, chunksum_file2):
72+
def find_dup(chunksums1, chunksums2):
8073
"""
8174
>>> import io
8275
>>> from pprint import pprint
8376
>>> chunksum1 = '''
84-
... sum1 /A/1 fck0sha2!a:10,b:10
85-
... sum2 /A/2 fck0sha2!c:10,d:10,e:10
86-
... sum3 /A/3 fck0sha2!f:10,g:10
87-
... sum4 /A/4 fck0sha2!h:10
77+
... bee1 /A/1 fck0sha2!aa:10,bb:10
78+
... bee2 /A/2 fck0sha2!cc:10,dd:10,ee:10
79+
... bee3 /A/3 fck0sha2!ff:10,f0:10
80+
... bee4 /A/4 fck0sha2!f1:10
8881
... '''
8982
>>> chunksum2 = '''
90-
... sum5 /B/1 fck0sha2!m:10,n:10
91-
... sum6 /B/2 fck0sha2!c:10,d:10,f:10
92-
... sum7 /B/3 fck0sha2!f:10,x:10
93-
... sum8 /B/4 fck0sha2!h:10
83+
... bee5 /B/1 fck0sha2!a1:10,a2:10
84+
... bee6 /B/2 fck0sha2!cc:10,dd:10,ff:10
85+
... bee7 /B/3 fck0sha2!ff:10,a3:10
86+
... bee8 /B/4 fck0sha2!f1:10
9487
... '''
95-
>>> file1 = io.StringIO(chunksum1)
96-
>>> file2 = io.StringIO(chunksum2)
88+
>>> file1 = Chunksums.parse(io.StringIO(chunksum1))
89+
>>> file2 = Chunksums.parse(io.StringIO(chunksum2))
9790
>>> pprint(find_dup(file1, file2))
9891
[[1.0, 10, '/A/4', 10, '/B/4'],
9992
[0.6666666666666666, 30, '/A/2', 30, '/B/2'],
10093
[0.5, 20, '/A/3', 20, '/B/3'],
10194
[0.4, 20, '/A/3', 30, '/B/2']]
10295
10396
>>> chunksum_repeat = '''
104-
... sum a fck0sha2!a:1,a:1,a:1,b:2
105-
... sum b fck0sha2!a:1,b:2
106-
... sum c fck0sha2!a:1,a:1,a:1,b:2
97+
... bee1 a fck0sha2!aa:1,aa:1,aa:1,bb:2
98+
... bee2 b fck0sha2!aa:1,bb:2
99+
... bee3 c fck0sha2!aa:1,aa:1,aa:1,bb:2
107100
... '''
108-
>>> file1 = io.StringIO(chunksum_repeat)
109-
>>> file2 = io.StringIO(chunksum_repeat)
101+
>>> file1 = Chunksums.parse(io.StringIO(chunksum_repeat))
102+
>>> file2 = Chunksums.parse(io.StringIO(chunksum_repeat))
110103
>>> pprint(find_dup(file1, file2))
111-
[[1.0, 5, 'c', 5, 'a'], [0.75, 5, 'a', 3, 'b'], [0.75, 3, 'b', 5, 'c']]
104+
[[1.0, 5, 'a', 5, 'c'], [0.75, 5, 'a', 3, 'b'], [0.75, 3, 'b', 5, 'c']]
112105
"""
113-
index1 = get_index(chunksum_file1)
114-
index2 = get_index(chunksum_file2)
115-
dups = sorted(find_dup_files(index1, index2), reverse=True)
106+
dups = sorted(find_dup_files(chunksums1, chunksums2), reverse=True)
116107
return dups
117108

118109

119110
def print_plain_report(dups, output_file):
120111
"""
121112
>>> import io
122113
>>> chunksum1 = '''
123-
... sum1 /A/1 fck0sha2!a:10,b:10
124-
... sum2 /A/2 fck0sha2!c:10,d:10,e:10
125-
... sum3 /A/3 fck0sha2!f:10,g:10
126-
... sum4 /A/4 fck0sha2!h:10
114+
... bee1 /A/1 fck0sha2!aa:10,bb:10
115+
... bee2 /A/2 fck0sha2!cc:10,dd:10,ee:10
116+
... bee3 /A/3 fck0sha2!ff:10,f0:10
117+
... bee4 /A/4 fck0sha2!f1:10
127118
... '''
128119
>>> chunksum2 = '''
129-
... sum5 /B/1 fck0sha2!m:10,n:10
130-
... sum6 /B/2 fck0sha2!c:10,d:10,f:10
131-
... sum7 /B/3 fck0sha2!f:10,x:10
132-
... sum8 /B/4 fck0sha2!h:10
120+
... bee5 /B/1 fck0sha2!a1:10,a2:10
121+
... bee6 /B/2 fck0sha2!cc:10,dd:10,ff:10
122+
... bee7 /B/3 fck0sha2!ff:10,a3:10
123+
... bee8 /B/4 fck0sha2!f1:10
133124
... '''
134-
>>> file1 = io.StringIO(chunksum1)
135-
>>> file2 = io.StringIO(chunksum2)
125+
>>> file1 = Chunksums.parse(io.StringIO(chunksum1))
126+
>>> file2 = Chunksums.parse(io.StringIO(chunksum2))
136127
>>> dups = find_dup(file1, file2)
137128
>>> print_plain_report(dups, sys.stdout)
138129
100.00% /A/4 (10B) /B/4 (10B)
@@ -141,15 +132,15 @@ def print_plain_report(dups, output_file):
141132
40.00% /A/3 (20B) /B/2 (30B)
142133
143134
>>> chunksum_repeat = '''
144-
... sum a fck0sha2!a:1,a:1,a:1,b:2
145-
... sum b fck0sha2!a:1,b:2
146-
... sum c fck0sha2!a:1,a:1,a:1,b:2
135+
... bee1 a fck0sha2!aa:1,aa:1,aa:1,bb:2
136+
... bee2 b fck0sha2!aa:1,bb:2
137+
... bee3 c fck0sha2!aa:1,aa:1,aa:1,bb:2
147138
... '''
148-
>>> file1 = io.StringIO(chunksum_repeat)
149-
>>> file2 = io.StringIO(chunksum_repeat)
139+
>>> file1 = Chunksums.parse(io.StringIO(chunksum_repeat))
140+
>>> file2 = Chunksums.parse(io.StringIO(chunksum_repeat))
150141
>>> dups = find_dup(file1, file2)
151142
>>> print_plain_report(dups, sys.stdout)
152-
100.00% c (5B) a (5B)
143+
100.00% a (5B) c (5B)
153144
75.00% a (5B) b (3B)
154145
75.00% b (3B) c (5B)
155146
"""
@@ -190,10 +181,10 @@ def main():
190181
191182
>>> import tempfile
192183
>>> f1 = tempfile.NamedTemporaryFile()
193-
>>> _ = f1.write(b'sum1 /A/1 fck0sha2!a:10,b:10')
184+
>>> _ = f1.write(b'bee1 /A/1 fck0sha2!aa:10,bb:10')
194185
>>> f1.flush()
195186
>>> f2 = tempfile.NamedTemporaryFile()
196-
>>> _ = f2.write(b'sum2 /B/1 fck0sha2!c:10,b:10')
187+
>>> _ = f2.write(b'bee2 /B/1 fck0sha2!cc:10,bb:10')
197188
>>> f2.flush()
198189
>>> sys.argv = ['chunkdup', f1.name, f2.name]
199190
>>> main()
@@ -218,7 +209,10 @@ def main():
218209
parser.print_help()
219210
sys.exit()
220211

221-
dups = find_dup(open(args.chunksums1), open(args.chunksums2))
212+
dups = find_dup(
213+
Chunksums.parse(open(args.chunksums1)),
214+
Chunksums.parse(open(args.chunksums2)),
215+
)
222216
print_plain_report(dups, sys.stdout)
223217

224218

0 commit comments

Comments
 (0)