Skip to content

Commit 63f2505

Browse files
authored
Entropy validation refactoring (#700)
* Entropy refactoring
* fix
* reduce github_pat_ pattern
* docs
* lxml==5.3.2
* import optimization
* BM ref actual
1 parent c8ddb62 commit 63f2505

36 files changed

+3313
-3283
lines changed

.ci/benchmark.txt

Lines changed: 25 additions & 25 deletions
Large diffs are not rendered by default.

.github/workflows/benchmark.yml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ jobs:
3131
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
3232
with:
3333
repository: Samsung/CredData
34-
ref: 4e9ebe54a749df6f1737737313b3b834a8a06429
34+
ref: f3f609275a4147b8b1b7075eef5af318ba8c8a2f
3535

3636
- name: Markup hashing
3737
run: |
@@ -87,7 +87,7 @@ jobs:
8787
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
8888
with:
8989
repository: Samsung/CredData
90-
ref: 4e9ebe54a749df6f1737737313b3b834a8a06429
90+
ref: f3f609275a4147b8b1b7075eef5af318ba8c8a2f
9191

9292
- name: Markup hashing
9393
run: |
@@ -190,7 +190,7 @@ jobs:
190190
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
191191
with:
192192
repository: Samsung/CredData
193-
ref: 4e9ebe54a749df6f1737737313b3b834a8a06429
193+
ref: f3f609275a4147b8b1b7075eef5af318ba8c8a2f
194194

195195
- name: Markup hashing
196196
run: |
@@ -378,7 +378,7 @@ jobs:
378378
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
379379
with:
380380
repository: Samsung/CredData
381-
ref: 4e9ebe54a749df6f1737737313b3b834a8a06429
381+
ref: f3f609275a4147b8b1b7075eef5af318ba8c8a2f
382382

383383
- name: Markup hashing
384384
run: |

credsweeper/common/morpheme_checklist.txt

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -885,7 +885,7 @@ mbler
885885
mean
886886
measur
887887
medi
888-
medusa
888+
medus
889889
meet
890890
mem_
891891
memb
@@ -925,7 +925,7 @@ month
925925
morp
926926
mory
927927
mote
928-
motorola
928+
motor
929929
mount
930930
move
931931
mpeg
@@ -1005,6 +1005,7 @@ origin
10051005
orithm
10061006
ormat
10071007
orph
1008+
otorola
10081009
ottle
10091010
ously
10101011
out
@@ -1485,6 +1486,7 @@ up_
14851486
updat
14861487
upgrade
14871488
url
1489+
usa
14881490
usb
14891491
use
14901492
usin

credsweeper/credentials/line_data.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -393,7 +393,7 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:
393393
cut_pos = StartEnd(self.variable_start if 0 <= self.variable_start else self.value_start,
394394
self.value_end) if subtext else None
395395
if isinstance(self.value, str):
396-
entropy = round(Util.get_shannon_entropy(self.value, string.printable), 5)
396+
entropy = round(Util.get_shannon_entropy(self.value), 5)
397397
else:
398398
entropy = None
399399
full_output = {

credsweeper/filters/value_azure_token_check.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
import contextlib
22
import json
33

4-
from credsweeper.common.constants import Chars
54
from credsweeper.config import Config
65
from credsweeper.credentials import LineData
76
from credsweeper.file_handler.analysis_target import AnalysisTarget
@@ -45,7 +44,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
4544
# must be all parts in payload
4645
return True
4746
min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(parts[2]))
48-
entropy = Util.get_shannon_entropy(parts[2], Chars.BASE64URL_CHARS.value)
47+
entropy = Util.get_shannon_entropy(parts[2])
4948
# good signature has to be like random bytes
5049
return entropy < min_entropy
5150

credsweeper/filters/value_base64_part_check.py

Lines changed: 30 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import contextlib
22
import re
33
import statistics
4+
from itertools import takewhile
45

56
from credsweeper.common.constants import Chars
67
from credsweeper.config import Config
@@ -16,8 +17,8 @@ class ValueBase64PartCheck(Filter):
1617
Check that candidate is NOT a part of base64 long line
1718
"""
1819

19-
base64_pattern = re.compile(r"^(\\{1,8}[0abfnrtv]|[0-9A-Za-z+/=]){1,4000}")
20-
base64_set = set(Chars.BASE64STDPAD_CHARS.value)
20+
base64_pattern = re.compile(r"^(\\{1,8}[0abfnrtv]|[0-9A-Za-z+/=]){1,4000}$")
21+
base64_char_set = set(Chars.BASE64STDPAD_CHARS.value + '\\')
2122

2223
def __init__(self, config: Config = None) -> None:
2324
pass
@@ -64,38 +65,46 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
6465
elif right_end - left_start >= 2 * len_value:
6566
# simple analysis for data too large to yield sensible insights
6667
part_set = set(line[left_start:right_end])
67-
if not part_set.difference(self.base64_set):
68+
if not part_set.difference(ValueBase64PartCheck.base64_char_set):
6869
# obvious case: all characters are base64 standard
6970
return True
7071

71-
left_part = line[left_start:line_data.value_start]
72-
len_left = len(left_part)
73-
right_part = line[line_data.value_end:right_end]
74-
len_right = len(right_part)
72+
left_part = ''.join(
73+
takewhile(lambda x: x in ValueBase64PartCheck.base64_char_set,
74+
reversed(line[left_start:line_data.value_start])))
75+
76+
right_part = ''.join(
77+
takewhile(lambda x: x in ValueBase64PartCheck.base64_char_set, line[line_data.value_end:right_end]))
7578

7679
min_entropy_value = ValueEntropyBase64Check.get_min_data_entropy(len_value)
77-
value_entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)
7880

79-
if ValueEntropyBase64Check.min_length < len_left:
80-
left_entropy = Util.get_shannon_entropy(left_part, Chars.BASE64STD_CHARS.value)
81-
if len_left < len_value:
82-
left_entropy *= len_value / len_left
83-
else:
84-
left_entropy = min_entropy_value
81+
left_entropy = Util.get_shannon_entropy(left_part)
82+
value_entropy = Util.get_shannon_entropy(value)
83+
right_entropy = Util.get_shannon_entropy(right_part)
84+
common = left_part + value + right_part
85+
common_entropy = Util.get_shannon_entropy(common)
86+
min_entropy_common = ValueEntropyBase64Check.get_min_data_entropy(len(common))
87+
if min_entropy_common < common_entropy:
88+
return True
8589

86-
if ValueEntropyBase64Check.min_length < len_right:
87-
right_entropy = Util.get_shannon_entropy(right_part, Chars.BASE64STD_CHARS.value)
88-
if len_right < len_value:
89-
left_entropy *= len_right / len_left
90+
if left_entropy and right_entropy:
91+
data = [left_entropy, value_entropy, right_entropy, min_entropy_value, common_entropy]
92+
elif left_entropy and not right_entropy:
93+
data = [left_entropy, value_entropy, min_entropy_value, min_entropy_value, common_entropy]
94+
elif not left_entropy and right_entropy:
95+
data = [value_entropy, right_entropy, min_entropy_value, min_entropy_value, common_entropy]
9096
else:
91-
right_entropy = min_entropy_value
97+
return False
9298

93-
data = [left_entropy, value_entropy, right_entropy, min_entropy_value]
9499
avg = statistics.mean(data)
95100
stdev = statistics.stdev(data, avg)
96101
avg_min = avg - 1.1 * stdev
97-
if avg_min <= left_entropy and avg_min <= right_entropy:
102+
if (0. == left_entropy or avg_min < left_entropy or left_entropy < value_entropy < right_entropy) \
103+
and (
104+
0. == right_entropy or avg_min < right_entropy or right_entropy < value_entropy < left_entropy):
98105
# high entropy of bound parts looks like a part of base64 long line
99106
return True
107+
else:
108+
return False
100109

101110
return False

credsweeper/filters/value_discord_bot_check.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import contextlib
22

3-
from credsweeper.common.constants import Chars
43
from credsweeper.config import Config
54
from credsweeper.credentials import LineData
65
from credsweeper.file_handler.analysis_target import AnalysisTarget
@@ -32,7 +31,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
3231
id_part = line_data.value[:dot_separator_index]
3332
discord_id = int(Util.decode_base64(id_part, padding_safe=True, urlsafe_detect=True))
3433
entropy_part = line_data.value[dot_separator_index:]
35-
entropy = Util.get_shannon_entropy(entropy_part, Chars.BASE64URL_CHARS.value)
34+
entropy = Util.get_shannon_entropy(entropy_part)
3635
min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(entropy_part))
3736
if 1000 <= discord_id and min_entropy <= entropy:
3837
return False
credsweeper/filters/value_entropy_base32_check.py

Lines changed: 12 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1,42 +1,26 @@
11
import math
2+
from functools import cache
23

3-
from credsweeper.common.constants import Chars
44
from credsweeper.config import Config
5-
from credsweeper.credentials import LineData
6-
from credsweeper.file_handler.analysis_target import AnalysisTarget
7-
from credsweeper.filters import Filter
8-
from credsweeper.utils import Util
5+
from credsweeper.filters.value_entropy_base_check import ValueEntropyBaseCheck
96

107

11-
class ValueEntropyBase32Check(Filter):
12-
"""Check that candidate have Shanon Entropy (for [a-z0-9])"""
8+
class ValueEntropyBase32Check(ValueEntropyBaseCheck):
9+
"""Base32 entropy check"""
1310

1411
def __init__(self, config: Config = None) -> None:
15-
pass
12+
super().__init__(config)
1613

1714
@staticmethod
15+
@cache
1816
def get_min_data_entropy(x: int) -> float:
1917
"""Returns average entropy for size of random data. Precalculated data is applied for speedup"""
20-
if 16 == x:
21-
y = 3.46
22-
elif 10 <= x:
23-
# approximation does not exceed stdev
24-
y = 0.64 * math.log2(x) + 0.9
18+
if 8 <= x < 17:
19+
y = 0.80569236 * math.log2(x) + 0.13439734
20+
elif 17 <= x < 33:
21+
y = 0.66350481 * math.log2(x) + 0.71143862
22+
elif 33 <= x:
23+
y = 4.04
2524
else:
2625
y = 0
2726
return y
28-
29-
def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
30-
"""Run filter checks on received credential candidate data 'line_data'.
31-
32-
Args:
33-
line_data: credential candidate data
34-
target: multiline target from which line data was obtained
35-
36-
Return:
37-
True, if need to filter candidate and False if left
38-
39-
"""
40-
entropy = Util.get_shannon_entropy(line_data.value, Chars.BASE32_CHARS.value)
41-
min_entropy = ValueEntropyBase32Check.get_min_data_entropy(len(line_data.value))
42-
return min_entropy > entropy or 0 == min_entropy
credsweeper/filters/value_entropy_base36_check.py

Lines changed: 12 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1,46 +1,27 @@
11
import math
2+
from functools import cache
23

3-
from credsweeper.common.constants import Chars
44
from credsweeper.config import Config
5-
from credsweeper.credentials import LineData
6-
from credsweeper.file_handler.analysis_target import AnalysisTarget
7-
from credsweeper.filters import Filter
8-
from credsweeper.utils import Util
5+
from credsweeper.filters.value_entropy_base_check import ValueEntropyBaseCheck
96

107

11-
class ValueEntropyBase36Check(Filter):
12-
"""Check that candidate have Shanon Entropy (for [a-z0-9])"""
8+
class ValueEntropyBase36Check(ValueEntropyBaseCheck):
9+
"""Base36 entropy check"""
1310

1411
def __init__(self, config: Config = None) -> None:
15-
pass
12+
super().__init__(config)
1613

1714
@staticmethod
15+
@cache
1816
def get_min_data_entropy(x: int) -> float:
1917
"""Returns minimal entropy for size of random data. Precalculated data is applied for speedup"""
2018
if 15 == x:
21-
y = 3.43
22-
elif 24 == x:
23-
y = 3.91
24-
elif 25 == x:
25-
y = 3.95
26-
elif 10 <= x:
27-
# approximation does not exceed standard deviation
28-
y = 0.7 * math.log2(x) + 0.7
19+
# workaround for Dropbox App secret
20+
y = 3.374
21+
elif 10 <= x < 26:
22+
y = 0.731566857 * math.log2(x) + 0.474132
23+
elif 26 <= x:
24+
y = 3.9
2925
else:
3026
y = 0
3127
return y
32-
33-
def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
34-
"""Run filter checks on received credential candidate data 'line_data'.
35-
36-
Args:
37-
line_data: credential candidate data
38-
target: multiline target from which line data was obtained
39-
40-
Return:
41-
True, if need to filter candidate and False if left
42-
43-
"""
44-
entropy = Util.get_shannon_entropy(line_data.value, Chars.BASE36_CHARS.value)
45-
min_entropy = ValueEntropyBase36Check.get_min_data_entropy(len(line_data.value))
46-
return min_entropy > entropy or 0 == min_entropy
credsweeper/filters/value_entropy_base64_check.py

Lines changed: 20 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -1,59 +1,34 @@
11
import math
2+
from functools import cache
23

3-
from credsweeper.common.constants import Chars, ENTROPY_LIMIT_BASE64
44
from credsweeper.config import Config
5-
from credsweeper.credentials import LineData
6-
from credsweeper.file_handler.analysis_target import AnalysisTarget
7-
from credsweeper.filters import Filter
8-
from credsweeper.utils import Util
5+
from credsweeper.filters.value_entropy_base_check import ValueEntropyBaseCheck
96

107

11-
class ValueEntropyBase64Check(Filter):
12-
"""Check that candidate have Shanon Entropy > 3 (for HEX_CHARS or BASE36_CHARS) or > 4.5 (for BASE64_CHARS)."""
13-
14-
# If the value size is less than this value the entropy evaluation gives an imprecise result
15-
min_length = 12
8+
class ValueEntropyBase64Check(ValueEntropyBaseCheck):
9+
"""Base64 entropy check"""
1610

1711
def __init__(self, config: Config = None) -> None:
18-
pass
12+
super().__init__(config)
1913

2014
@staticmethod
15+
@cache
2116
def get_min_data_entropy(x: int) -> float:
2217
"""Returns minimal average entropy for size of random data. Precalculated round data is applied for speedup"""
23-
if 18 == x:
24-
y = 3.8
25-
elif 20 == x:
26-
y = 3.9
27-
elif 24 == x:
28-
y = 4.1
29-
elif 32 == x:
30-
y = 4.4
31-
elif ValueEntropyBase64Check.min_length <= x < 35:
32-
# logarithm base 2 - slow, but precise. Approximation does not exceed stdev
33-
y = 0.77 * math.log2(x) + 0.62
34-
elif 35 <= x < 60:
35-
y = ENTROPY_LIMIT_BASE64
36-
elif 60 <= x:
37-
# the entropy grows slowly after 60
38-
y = 5.0
18+
if 12 <= x < 18:
19+
y = 0.915 * math.log2(x) - 0.047
20+
elif 18 <= x < 35:
21+
y = 0.767 * math.log2(x) + 0.5677
22+
elif 35 <= x < 65:
23+
y = 0.944 * math.log2(x) - 0.009 * x - 0.04
24+
elif 65 <= x < 256:
25+
y = 0.621 * math.log2(x) - 0.003 * x + 1.54
26+
elif 256 <= x < 512:
27+
y = 5.77
28+
elif 512 <= x < 1024:
29+
y = 5.89
30+
elif 1024 <= x:
31+
y = 5.94
3932
else:
4033
y = 0
4134
return y
42-
43-
def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
44-
"""Run filter checks on received credential candidate data 'line_data'.
45-
46-
Args:
47-
line_data: credential candidate data
48-
target: multiline target from which line data was obtained
49-
50-
Return:
51-
True, if need to filter candidate and False if left
52-
53-
"""
54-
if '-' in line_data.value or '_' in line_data.value:
55-
entropy = Util.get_shannon_entropy(line_data.value, Chars.BASE64URL_CHARS.value)
56-
else:
57-
entropy = Util.get_shannon_entropy(line_data.value, Chars.BASE64STD_CHARS.value)
58-
min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(line_data.value))
59-
return min_entropy > entropy or 0 == min_entropy

0 commit comments

Comments
 (0)