Skip to content

Commit daedce2

Browse files
authored
Merge branch 'main' into version-up
2 parents 67356be + a7051b7 commit daedce2

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+2485
-2057
lines changed

.ci/benchmark.txt

Lines changed: 30 additions & 29 deletions
Large diffs are not rendered by default.

.flake8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[flake8]
22
max-line-length = 120
3-
extend-ignore = E203,E303,E131,E402
3+
extend-ignore = E402
44
per-file-ignores = __init__.py:F401

.github/workflows/benchmark.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
3232
with:
3333
repository: Samsung/CredData
34-
ref: 22b3f3789b3c2078d7dec12a6d210e11c1cc697c
34+
ref: 5c93f142f9c28c03ff35ee0862dc819b73cdb673
3535

3636
- name: Markup hashing
3737
run: |
@@ -87,7 +87,7 @@ jobs:
8787
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
8888
with:
8989
repository: Samsung/CredData
90-
ref: 22b3f3789b3c2078d7dec12a6d210e11c1cc697c
90+
ref: 5c93f142f9c28c03ff35ee0862dc819b73cdb673
9191

9292
- name: Markup hashing
9393
run: |
@@ -190,7 +190,7 @@ jobs:
190190
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
191191
with:
192192
repository: Samsung/CredData
193-
ref: 22b3f3789b3c2078d7dec12a6d210e11c1cc697c
193+
ref: 5c93f142f9c28c03ff35ee0862dc819b73cdb673
194194

195195
- name: Markup hashing
196196
run: |
@@ -378,7 +378,7 @@ jobs:
378378
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
379379
with:
380380
repository: Samsung/CredData
381-
ref: 22b3f3789b3c2078d7dec12a6d210e11c1cc697c
381+
ref: 5c93f142f9c28c03ff35ee0862dc819b73cdb673
382382

383383
- name: Markup hashing
384384
run: |

.github/workflows/check.yml

Lines changed: 18 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -89,19 +89,19 @@ jobs:
8989

9090
- name: Analysing the code with pylint and minimum Python version 3.9
9191
if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
92-
run: pylint --py-version=3.9 --errors-only credsweeper
92+
run: pylint --py-version=3.9 --verbose credsweeper
9393

9494
- name: Analysing the code with pylint and minimum Python version 3.10
9595
if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
96-
run: pylint --py-version=3.10 --errors-only credsweeper
96+
run: pylint --py-version=3.10 --verbose credsweeper
9797

9898
- name: Analysing the code with pylint and minimum Python version 3.11
9999
if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
100-
run: pylint --py-version=3.11 --errors-only credsweeper
100+
run: pylint --py-version=3.11 --verbose credsweeper
101101

102102
- name: Analysing the code with pylint and minimum Python version 3.12
103103
if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
104-
run: pylint --py-version=3.12 --errors-only credsweeper
104+
run: pylint --py-version=3.12 --verbose credsweeper
105105

106106
# # # mypy
107107

@@ -120,13 +120,6 @@ jobs:
120120
run: |
121121
mypy --config-file .mypy.ini --python-version=3.12 credsweeper
122122
123-
# # # documentation
124-
125-
- name: Analysing the code with pylint for NEW missed docstrings of classes or functions
126-
if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
127-
run: |
128-
pylint --disable=E,R,W1203,W0718,C0114,C0103,C0303,C0412,C0413,C0415,C0200,C0201,C0325 --verbose credsweeper
129-
130123
# # # Documentation check
131124

132125
- name: Test for creation sphinx documentations
@@ -138,20 +131,6 @@ jobs:
138131
cd source
139132
python -m sphinx -T -E -b html -d _build/doctrees -D language=en . ./_html
140133
141-
# # # yapf
142-
143-
- name: Check project style
144-
if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
145-
run: |
146-
for f in credsweeper tests docs experiment; do
147-
yapf --style .style.yapf --recursive --in-place --parallel $f
148-
done
149-
if [ 0 -ne $(git ls-files -m | wc -l) ]; then
150-
git diff
151-
echo "<- difference how to apply the style"
152-
exit 1
153-
fi
154-
155134
# # # flake8
156135

157136
- name: Analysing the code with flake8
@@ -198,6 +177,20 @@ jobs:
198177
exit 1
199178
fi
200179
180+
# # # yapf
181+
182+
- name: Check project style
183+
if: ${{ always() && steps.setup_credsweeper.conclusion == 'success' }}
184+
run: |
185+
for f in credsweeper tests docs experiment; do
186+
yapf --style .style.yapf --recursive --in-place --parallel $f
187+
done
188+
if [ 0 -ne $(git ls-files -m | wc -l) ]; then
189+
git diff
190+
echo "<- difference how to apply the style"
191+
exit 1
192+
fi
193+
201194
# # # SECURITY.md check
202195

203196
- name: SECURITY.md check

.mypy.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
warn_return_any = True
55
warn_unused_configs = True
66
strict_optional = False
7+
check_untyped_defs = True
78

89
# Per-module options:
910

.pylintrc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,6 @@
22
max-line-length=120
33

44
extension-pkg-allow-list=lxml
5+
6+
[MESSAGES CONTROL]
7+
disable=R,W0718,W1203,C0415,C0413,C0103,C0114

credsweeper/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def logger_levels(log_level: str) -> str:
6363
Returns log_level in UPPERCASE if it is one of the keys
6464
"""
6565
val = log_level.upper()
66-
if any(val == i for i in Logger.LEVELS.keys()):
66+
if val in Logger.LEVELS:
6767
return val
6868
raise ArgumentTypeError(f"Log level provided: {log_level} -- must be one of: {' | '.join(Logger.LEVELS.keys())}")
6969

credsweeper/app.py

Lines changed: 21 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@
1515
from credsweeper.config import Config
1616
from credsweeper.credentials import Candidate, CredentialManager, CandidateKey
1717
from credsweeper.deep_scanner.deep_scanner import DeepScanner
18+
from credsweeper.file_handler.content_provider import ContentProvider
1819
from credsweeper.file_handler.diff_content_provider import DiffContentProvider
1920
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
2021
from credsweeper.file_handler.abstract_provider import AbstractProvider
2122
from credsweeper.file_handler.text_content_provider import TextContentProvider
2223
from credsweeper.scanner import Scanner
24+
from credsweeper.ml_model.ml_validator import MlValidator
2325
from credsweeper.utils import Util
2426

2527
logger = logging.getLogger(__name__)
@@ -94,7 +96,7 @@ def __init__(self,
9496
log_level: str - level for pool initializer according to logging levels (UPPERCASE)
9597
9698
"""
97-
self.pool_count: int = int(pool_count) if int(pool_count) > 1 else 1
99+
self.pool_count: int = max(1, int(pool_count))
98100
if not (_severity := Severity.get(severity)):
99101
raise RuntimeError(f"Severity level provided: {severity}"
100102
f" -- must be one of: {' | '.join([i.value for i in Severity])}")
@@ -123,9 +125,9 @@ def __init__(self,
123125
self.ml_config = ml_config
124126
self.ml_model = ml_model
125127
self.ml_providers = ml_providers
126-
self.ml_validator = None
127128
self.__thrifty = thrifty
128129
self.__log_level = log_level
130+
self.__ml_validator: Optional[MlValidator] = None
129131

130132
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
131133

@@ -182,35 +184,22 @@ def _use_ml_validation(self) -> bool:
182184

183185
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
184186

185-
# the import cannot be done on top due
186-
# TypeError: cannot pickle 'onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession' object
187-
from credsweeper.ml_model import MlValidator
188-
189-
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
190-
191187
@property
192188
def ml_validator(self) -> MlValidator:
193189
"""ml_validator getter"""
194-
from credsweeper.ml_model import MlValidator
195190
if not self.__ml_validator:
196-
self.__ml_validator: MlValidator = MlValidator(
191+
self.__ml_validator = MlValidator(
197192
threshold=self.ml_threshold, #
198193
ml_config=self.ml_config, #
199194
ml_model=self.ml_model, #
200195
ml_providers=self.ml_providers, #
201196
)
202-
assert self.__ml_validator, "self.__ml_validator was not initialized"
197+
if not self.__ml_validator:
198+
raise RuntimeError("MlValidator was not initialized!")
203199
return self.__ml_validator
204200

205201
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
206202

207-
@ml_validator.setter
208-
def ml_validator(self, _ml_validator: Optional[MlValidator]) -> None:
209-
"""ml_validator setter"""
210-
self.__ml_validator = _ml_validator
211-
212-
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
213-
214203
@staticmethod
215204
def pool_initializer(log_kwargs) -> None:
216205
"""Ignore SIGINT in child processes."""
@@ -219,20 +208,6 @@ def pool_initializer(log_kwargs) -> None:
219208

220209
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
221210

222-
@property
223-
def config(self) -> Config:
224-
"""config getter"""
225-
return self.__config
226-
227-
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
228-
229-
@config.setter
230-
def config(self, config: Config) -> None:
231-
"""config setter"""
232-
self.__config = config
233-
234-
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
235-
236211
def run(self, content_provider: AbstractProvider) -> int:
237212
"""Run an analysis of 'content_provider' object.
238213
@@ -241,9 +216,10 @@ def run(self, content_provider: AbstractProvider) -> int:
241216
242217
"""
243218
_empty_list: Sequence[Union[DiffContentProvider, TextContentProvider]] = []
244-
file_extractors: Sequence[Union[DiffContentProvider, TextContentProvider]] = \
245-
content_provider.get_scannable_files(self.config) if content_provider else _empty_list
246-
logger.info(f"Start Scanner for {len(file_extractors)} providers")
219+
file_extractors = content_provider.get_scannable_files(self.config) if content_provider else _empty_list
220+
if not file_extractors:
221+
logger.info(f"No scannable targets for {len(content_provider.paths)} paths")
222+
return 0
247223
self.scan(file_extractors)
248224
self.post_processing()
249225
# PatchesProvider has the attribute. Using isinstance here causes a circular import error
@@ -260,7 +236,7 @@ def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextConten
260236
content_providers: file objects to scan
261237
262238
"""
263-
if 1 < self.pool_count:
239+
if 1 < self.pool_count and 1 < len(content_providers):
264240
self.__multi_jobs_scan(content_providers)
265241
else:
266242
self.__single_job_scan(content_providers)
@@ -269,6 +245,7 @@ def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextConten
269245

270246
def __single_job_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
271247
"""Performs scan in main thread"""
248+
logger.info(f"Scan for {len(content_providers)} providers")
272249
all_cred = self.files_scan(content_providers)
273250
self.credential_manager.set_credentials(all_cred)
274251

@@ -284,12 +261,14 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
284261
if "SILENCE" == self.__log_level:
285262
logging.addLevelName(60, "SILENCE")
286263
log_kwargs["level"] = self.__log_level
287-
with multiprocessing.get_context("spawn").Pool(processes=self.pool_count,
288-
initializer=self.pool_initializer,
264+
pool_count = min(self.pool_count, len(content_providers))
265+
logger.info(f"Scan in {pool_count} processes for {len(content_providers)} providers")
266+
with multiprocessing.get_context("spawn").Pool(processes=pool_count,
267+
initializer=CredSweeper.pool_initializer,
289268
initargs=(log_kwargs, )) as pool:
290269
try:
291-
for scan_results in pool.imap_unordered(self.files_scan, (content_providers[x::self.pool_count]
292-
for x in range(self.pool_count))):
270+
for scan_results in pool.imap_unordered(self.files_scan,
271+
(content_providers[x::pool_count] for x in range(pool_count))):
293272
for cred in scan_results:
294273
self.credential_manager.add_credential(cred)
295274
except KeyboardInterrupt:
@@ -301,9 +280,7 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
301280

302281
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
303282

304-
def files_scan(
305-
self, #
306-
content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> List[Candidate]:
283+
def files_scan(self, content_providers: Sequence[ContentProvider]) -> List[Candidate]:
307284
"""Auxiliary method for scanning one sequence"""
308285
all_cred: List[Candidate] = []
309286
for provider in content_providers:
@@ -316,7 +293,7 @@ def files_scan(
316293

317294
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
318295

319-
def file_scan(self, content_provider: Union[DiffContentProvider, TextContentProvider]) -> List[Candidate]:
296+
def file_scan(self, content_provider: ContentProvider) -> List[Candidate]:
320297
"""Run scanning of file from 'file_provider'.
321298
322299
Args:

credsweeper/common/constants.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,6 @@ class Chars(Enum):
9696
ASCII_PRINTABLE = string.printable
9797

9898

99-
ENTROPY_LIMIT_BASE64 = 4.5
100-
ENTROPY_LIMIT_BASE3x = 3
101-
102-
10399
class GroupType(Enum):
104100
"""Group type - used in Group constructor for load predefined set of filters"""
105101
KEYWORD = "keyword"
@@ -148,7 +144,8 @@ class DiffRowType(Enum):
148144
CHUNK_STEP_SIZE = CHUNK_SIZE - OVERLAP_SIZE
149145
# ML hunk size: limits the size of a variable or value and is used to get a substring near the value
150146
ML_HUNK = 80
151-
""" values according https://docs.python.org/3/library/codecs.html """
147+
148+
# values according to https://docs.python.org/3/library/codecs.html
152149
UTF_8 = "utf_8"
153150
UTF_16 = "utf_16"
154151
LATIN_1 = "latin_1"

credsweeper/credentials/candidate_key.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __eq__(self, other):
2424
return self.key == other.key
2525

2626
def __ne__(self, other):
27-
return not (self == other)
27+
return not bool(self == other)
2828

2929
def __repr__(self) -> str:
3030
return f"{self.key}:{self.__line}"

0 commit comments

Comments
 (0)