Skip to content

Commit c776a2b

Browse files
authored
add ratio before the bar (#22)
* refactor: load hash * refactor and add ratio before the bar
1 parent d045111 commit c776a2b

File tree

4 files changed

+87
-67
lines changed

4 files changed

+87
-67
lines changed

chunkdup/chunkdiff.py

Lines changed: 29 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import argparse
22
import sys
3-
from difflib import SequenceMatcher
43
from itertools import groupby
54
from math import ceil
65

6+
from .diff import find_diff
77
from .sums import Chunksums
88

99

@@ -18,25 +18,6 @@
1818
END = "\033[0m"
1919

2020

21-
def find_diff(chunks1, sizes1, chunks2, sizes2):
22-
s = SequenceMatcher(a=chunks1, b=chunks2)
23-
diff = []
24-
total = 0
25-
tag_map = {
26-
"equal": ["=", "="],
27-
"replace": ["-", "+"],
28-
"delete": ["-", " "],
29-
"insert": [" ", "+"],
30-
}
31-
for tag, i1, i2, j1, j2 in s.get_opcodes():
32-
size1 = sum([s for s in sizes1[i1:i2]])
33-
size2 = sum([s for s in sizes2[j1:j2]])
34-
total += max(size1, size2)
35-
diff.append(tag_map[tag] + [size1, size2])
36-
37-
return total, diff
38-
39-
4021
def fill_line(bar_width, total, diff):
4122
zoom = bar_width / total
4223

@@ -66,12 +47,13 @@ def get_bar_layer(chunksums1, chunksums2, path1, path2, bar_width=40):
6647
f1 = chunksums1.get_file(path1)
6748
f2 = chunksums2.get_file(path2)
6849

69-
total, diff = find_diff(f1.hashes, f1.sizes, f2.hashes, f2.sizes)
50+
total, ratio, diff = find_diff(f1.hashes, f2.hashes, f1.sizes, f2.sizes)
7051
line1, line2 = fill_line(bar_width, total, diff)
71-
return line1, line2, f1.size, f2.size
52+
return ratio, line1, line2, f1.size, f2.size
7253

7354

7455
def print_2lines_bar(
56+
ratio,
7557
line1,
7658
line2,
7759
filesize1,
@@ -83,13 +65,13 @@ def print_2lines_bar(
8365
"""
8466
>>> line1 = ['-----', '==', '-----', '===']
8567
>>> line2 = ['++', ' ', '==', '+', ' ', '===']
86-
>>> print_2lines_bar(line1, line2, 100, 70, color=False)
87-
100 -----==-----===
88-
70 ++ ==+ ===
89-
>>> print_2lines_bar(line1, line2, 100, 70)
68+
>>> print_2lines_bar(0.5, line1, line2, 100, 70, color=False)
69+
50.00% 100 -----==-----===
70+
70 ++ ==+ ===
71+
>>> print_2lines_bar(0.5, line1, line2, 100, 70)
9072
... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
91-
100 ...
92-
70 ...
73+
50.00% 100 ...
74+
70 ...
9375
"""
9476

9577
def colorful(line):
@@ -105,15 +87,17 @@ def colorful(line):
10587
line1 = colorful(line1)
10688
line2 = colorful(line2)
10789

108-
for size, line in ((filesize1, line1), (filesize2, line2)):
90+
percent = f"{ratio * 100:>6.2f}%"
91+
for pre, size, line in ((percent, filesize1, line1), ("", filesize2, line2)):
10992
print(
110-
"{:>10} {}".format(size, "".join(line)),
93+
"{:>7s} {:>6} {}".format(pre, size, "".join(line)),
11194
file=output or sys.stdout,
11295
flush=True,
11396
)
11497

11598

11699
def print_1line_bar(
100+
ratio,
117101
line1,
118102
line2,
119103
filesize1,
@@ -125,11 +109,11 @@ def print_1line_bar(
125109
"""
126110
>>> line1 = ['-----', '==', ' ', '===']
127111
>>> line2 = ['++', ' ', '==', '+++++', '===']
128-
>>> print_1line_bar(line1, line2, 100, 70, color=False)
129-
▀100 ▄70 ██▀▀▀▒▒▄▄▄▄▄▒▒▒
130-
>>> print_1line_bar(line1, line2, 100, 70, color=True)
112+
>>> print_1line_bar(0.6, line1, line2, 100, 70, color=False)
113+
60.00% ▀100 ▄70 ██▀▀▀▒▒▄▄▄▄▄▒▒▒
114+
>>> print_1line_bar(0.6, line1, line2, 100, 70, color=True)
131115
... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
132-
▀100 ▄70 ...
116+
60.00% ▀100 ▄70 ...
133117
"""
134118

135119
pairs = list("".join(x) for x in zip("".join(line1), "".join(line2)))
@@ -156,7 +140,8 @@ def print_1line_bar(
156140
bar.append(item)
157141

158142
print(
159-
"▀{} ▄{} {}".format(
143+
"{:>6.2f}% ▀{} ▄{} {}".format(
144+
ratio * 100,
160145
filesize1,
161146
filesize2,
162147
"".join(bar),
@@ -188,13 +173,13 @@ def print_diff(
188173
>>> a = Chunksums.parse(open(f1.name))
189174
>>> b = Chunksums.parse(open(f2.name))
190175
>>> print_diff(a, b, './a', './b', color=False)
191-
▀35 ▄35 ▀▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒█████
176+
57.14% ▀35 ▄35 ▀▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒▄▄▄▄▄▒▒▒▒▒█████
192177
>>> print_diff(a, b, './a', './b', color=False, oneline=False)
193-
35 ---------========= ===== =====-----
194-
35 =========+++++=====+++++=====+++++
178+
57.14% 35 ---------========= ===== =====-----
179+
35 =========+++++=====+++++=====+++++
195180
"""
196181

197-
line1, line2, filesize1, filesize2 = get_bar_layer(
182+
ratio, line1, line2, filesize1, filesize2 = get_bar_layer(
198183
chunksums1,
199184
chunksums2,
200185
path1,
@@ -206,6 +191,7 @@ def print_diff(
206191
else:
207192
print_func = print_2lines_bar
208193
print_func(
194+
ratio,
209195
line1,
210196
line2,
211197
filesize1,
@@ -250,14 +236,14 @@ def main():
250236
>>> s = f.name
251237
>>> sys.argv = ['chunkdiff', '-s', s, '-s', s, './a', './b', '--nocolor']
252238
>>> main()
253-
▀45 ▄45 ▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒████▄▄▄▄▄▄▄▒▒▒▒████
239+
55.56% ▀45 ▄45 ▀▀▀▀▀▀▀▀▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒████▄▄▄▄▄▄▄▒▒▒▒████
254240
>>> sys.argv = ['chunkdiff', '-s', s, './a', './b', '-n', '-w', '10']
255241
>>> main()
256-
▀45 ▄45 ▀▀▒▒▒▒█▄▄▒█
242+
55.56% ▀45 ▄45 ▀▀▒▒▒▒█▄▄▒█
257243
>>> sys.argv = ['chunkdiff', '-s', s, './a', './b', '-n', '-b', 'twolines']
258244
>>> main()
259-
45 --------===============---- ====----
260-
45 ===============+++++++++++====++++
245+
55.56% 45 --------===============---- ====----
246+
45 ===============+++++++++++====++++
261247
262248
>>> sys.argv = ['chunkdiff', '-s', s, './bad', './beef']
263249
>>> try:

chunkdup/chunkdup.py

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,25 @@
22
import argparse
33
import signal
44
import sys
5-
from difflib import SequenceMatcher
65

6+
from .diff import find_diff
77
from .sums import Chunksums
88

99

1010
def diff_ratio(a, b, sizes1, sizes2):
1111
"""
1212
>>> sizes = {'a': 10, 'b': 10, 'c': 20}
13-
>>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'], sizes, sizes)
13+
>>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'],
14+
... [10, 10, 10, 10], [10, 10, 10, 10])
1415
1.0
15-
>>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'b', 'a'], sizes, sizes)
16+
>>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'b', 'a'],
17+
... [10, 10, 10, 10], [10, 10, 10, 10])
1618
0.75
17-
>>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'c', 'a'], sizes, sizes)
19+
>>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'c', 'a'],
20+
... [10, 10, 10, 10], [10, 20, 10])
1821
0.5
1922
"""
20-
matches = 0
21-
for tag, i1, i2, _, _ in SequenceMatcher(a=a, b=b).get_opcodes():
22-
if tag != "equal":
23-
continue
24-
matches += sum(
25-
[sizes1.get(chunk, 0) or sizes2.get(chunk, 0) for chunk in a[i1:i2]],
26-
)
27-
size1 = sum([sizes1.get(chunk) for chunk in a])
28-
size2 = sum([sizes2.get(chunk) for chunk in b])
29-
ratio = (2 * matches) / (size1 + size2)
23+
_, ratio, _ = find_diff(a, b, sizes1, sizes2)
3024
return ratio
3125

3226

@@ -60,8 +54,8 @@ def find_dup_files(chunksums1, chunksums2):
6054
ratio = diff_ratio(
6155
f1.hashes,
6256
f2.hashes,
63-
dict(f1.chunks),
64-
dict(f2.chunks),
57+
f1.sizes,
58+
f2.sizes,
6559
)
6660
if f1.path == f2.path and ratio == 1.0:
6761
continue

chunkdup/diff.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from difflib import SequenceMatcher
2+
3+
4+
# Maps a SequenceMatcher opcode tag to the pair of ASCII markers used for
# the first and the second file respectively.
DIFF_ASCII = {
    "equal": ["=", "="],
    "replace": ["-", "+"],
    "delete": ["-", " "],
    "insert": [" ", "+"],
}


def find_diff(chunks1, chunks2, sizes1, sizes2):
    """
    Align two chunk sequences and summarise how they differ.

    ``chunks1`` / ``chunks2`` are the sequences being compared and
    ``sizes1`` / ``sizes2`` hold the size of each chunk at the same index.

    Returns a ``(total, ratio, diff)`` tuple:

    * ``total`` -- sum over all opcodes of the larger of the two sides' sizes.
    * ``ratio`` -- ``2 * matched_size / (sum(sizes1) + sum(sizes2))``, where
      the matched size is taken from the first sequence.  When the combined
      size is zero (e.g. both inputs are empty) the ratio is ``1.0`` instead
      of raising ``ZeroDivisionError``; this mirrors what
      ``SequenceMatcher.ratio()`` reports for two empty sequences.
    * ``diff`` -- one ``[marker1, marker2, size1, size2]`` entry per opcode,
      with the markers taken from ``DIFF_ASCII``.

    >>> find_diff(['a', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'],
    ...           [10, 10, 10, 10], [10, 10, 10, 10])
    (40, 1.0, [['=', '=', 40, 40]])
    >>> find_diff(['a', 'a', 'a', 'a'], ['a', 'a', 'a', 'b'],
    ...           [10, 10, 10, 10], [10, 10, 10, 10])
    (40, 0.75, [['=', '=', 30, 30], ['-', '+', 10, 10]])
    >>> find_diff(['a', 'a', 'a', 'a'], ['c', 'a', 'a'],
    ...           [10, 10, 10, 10], [10, 20, 10])
    (60, 0.5, [[' ', '+', 0, 10], ['=', '=', 20, 30], ['-', ' ', 20, 0]])
    >>> find_diff([], [], [], [])
    (0, 1.0, [])
    """
    diff = []
    total = 0
    matches = 0
    matcher = SequenceMatcher(a=chunks1, b=chunks2)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        size1 = sum(sizes1[i1:i2])
        size2 = sum(sizes2[j1:j2])
        total += max(size1, size2)
        if tag == "equal":
            matches += size1
        diff.append(DIFF_ASCII[tag] + [size1, size2])

    combined = sum(sizes1) + sum(sizes2)
    # Nothing to compare byte-wise: treat the two sides as identical rather
    # than dividing by zero.
    ratio = (2 * matches) / combined if combined else 1.0

    return total, ratio, diff

chunkdup/sums.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
def load_hash(hash):
    """Return *hash* as raw ``bytes``.

    A ``bytes`` value is passed through untouched; any other value is handed
    to ``bytes.fromhex`` and decoded from its hexadecimal text form.
    """
    return hash if isinstance(hash, bytes) else bytes.fromhex(hash)
6+
7+
18
class File:
29
def __init__(self, hash, path, alg_name, chunks):
3-
if isinstance(hash, bytes):
4-
self.hash = hash
5-
else:
6-
self.hash = bytes.fromhex(hash)
10+
self.hash = load_hash(hash)
711
self.path = path
812
self.alg_name = alg_name
913
self._load_chunks(chunks)
@@ -14,10 +18,7 @@ def _load_chunks(self, chunks):
1418
self.hashes, self.sizes = [], []
1519
else:
1620
chunks = list(
17-
[
18-
(hash if isinstance(hash, bytes) else bytes.fromhex(hash), size)
19-
for hash, size in chunks
20-
],
21+
[(load_hash(hash), size) for hash, size in chunks],
2122
)
2223
self.hashes, self.sizes = list(zip(*chunks))
2324

0 commit comments

Comments
 (0)